From a953e4597abd51b74c99e0e3b7074532a60fd031 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: sched: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c, where appropriate. This saves some allocated space as well as many wasted cycles going through node entries that are non-existent. For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a..1ed8011 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6879,9 +6879,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) min_val = INT_MAX; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Start at @node */ - n = (node + i) % MAX_NUMNODES; + n = (node + i) % nr_node_ids; if (!nr_cpus_node(n)) continue; @@ -7075,7 +7075,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) if (!sched_group_nodes) continue; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; *nodemask = node_to_cpumask(i); @@ -7263,7 +7263,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -7407,7 +7407,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Set up physical groups */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { SCHED_CPUMASK_VAR(nodemask, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7431,7 +7431,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, send_covered, tmpmask); } - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7470,9 +7470,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpus_or(*covered, *covered, *nodemask); prev = sg; - for (j = 0; j < MAX_NUMNODES; j++) { + for (j = 0; j < nr_node_ids; j++) { SCHED_CPUMASK_VAR(notcovered, allmasks); - int n = (i + j) % MAX_NUMNODES; + int n = (i + j) % nr_node_ids; node_to_cpumask_ptr(pnodemask, n); cpus_complement(*notcovered, *covered); @@ -7525,7 +7525,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) + for (i = 0; i < nr_node_ids; i++) init_numa_sched_groups_power(sched_group_nodes[i]); if (sd_allnodes) { -- cgit v1.1 From 363ab6f1424cdea63e5d182312d60e19077b892a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: core: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 2 +- kernel/rcuclassic.c | 2 +- kernel/rcupreempt.c | 10 +++++----- 
kernel/sched.c | 36 ++++++++++++++++++------------------ kernel/sched_fair.c | 2 +- kernel/sched_rt.c | 6 +++--- kernel/taskstats.c | 4 ++-- kernel/workqueue.c | 6 +++--- 8 files changed, 34 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a..50ae922 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -390,7 +390,7 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); - for_each_cpu_mask(cpu, frozen_cpus) { + for_each_cpu_mask_nr(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0..251358d 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -92,7 +92,7 @@ static void force_quiescent_state(struct rcu_data *rdp, */ cpumask = rcp->cpumask; cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask(cpu, cpumask) + for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } } diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf19..18af270 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -657,7 +657,7 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. */ - for_each_cpu_mask(cpu, rcu_cpu_online_map) { + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; dyntick_save_progress_counter(cpu); } @@ -675,7 +675,7 @@ rcu_try_flip_waitack(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) if (rcu_try_flip_waitack_needed(cpu) && per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); @@ -707,7 +707,7 @@ rcu_try_flip_waitzero(void) /* Check to see if the sum of the "last" counters is zero. */ RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; if (sum != 0) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); @@ -722,7 +722,7 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. 
*/ - for_each_cpu_mask(cpu, rcu_cpu_online_map) { + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; dyntick_save_progress_counter(cpu); } @@ -742,7 +742,7 @@ rcu_try_flip_waitmb(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) if (rcu_try_flip_waitmb_needed(cpu) && per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); diff --git a/kernel/sched.c b/kernel/sched.c index 1ed8011..814d6e1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2271,7 +2271,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -2313,7 +2313,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, /* Traverse only the allowed CPUs */ cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask(i, *tmp) { + for_each_cpu_mask_nr(i, *tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -3296,7 +3296,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { struct rq *rq; if (!cpu_isset(i, *cpus)) @@ -3560,7 +3560,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long max_load = 0; int i; - for_each_cpu_mask(i, group->cpumask) { + for_each_cpu_mask_nr(i, group->cpumask) { unsigned long wl; if (!cpu_isset(i, *cpus)) @@ -4100,7 +4100,7 @@ static void run_rebalance_domains(struct softirq_action *h) int balance_cpu; cpu_clear(this_cpu, cpus); - for_each_cpu_mask(balance_cpu, cpus) { + for_each_cpu_mask_nr(balance_cpu, cpus) { /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@ -6832,7 +6832,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(*covered); - for_each_cpu_mask(i, *span) { + for_each_cpu_mask_nr(i, *span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; @@ -6843,7 +6843,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask(j, *span) { + for_each_cpu_mask_nr(j, *span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; @@ -7043,7 +7043,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; do { - for_each_cpu_mask(j, sg->cpumask) { + for_each_cpu_mask_nr(j, sg->cpumask) { struct sched_domain *sd; sd = &per_cpu(phys_domains, j); @@ -7068,7 +7068,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; - for_each_cpu_mask(cpu, *cpu_map) { + for_each_cpu_mask_nr(cpu, *cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@ -7302,7 +7302,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Set up domains for cpus specified by the cpu_map. 
*/ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = NULL, *p; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7374,7 +7374,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { SCHED_CPUMASK_VAR(this_sibling_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7391,7 +7391,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { SCHED_CPUMASK_VAR(this_core_map, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7458,7 +7458,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask(j, *nodemask) { + for_each_cpu_mask_nr(j, *nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); @@ -7504,21 +7504,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(cpu_domains, i); init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(core_domains, i); init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd = &per_cpu(phys_domains, i); init_sched_groups_power(i, sd); @@ -7538,7 +7538,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Attach the domains */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask_nr(i, *cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -7621,7 +7621,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) unregister_sched_domain_sysctl(); - for_each_cpu_mask(i, *cpu_map) + for_each_cpu_mask_nr(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); arch_destroy_sched_domains(cpu_map, &tmpmask); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e24ecd3..e398318 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1022,7 +1022,7 @@ static int wake_idle(int cpu, struct task_struct *p) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask_nr(i, tmp) { if (idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 060e87b..d73386c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -231,7 +231,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu_mask_nr(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@ -272,7 +272,7 @@ static int balance_runtime(struct rt_rq *rt_rq) spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask(i, rd->span) { + for_each_cpu_mask_nr(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@ -1000,7 +1000,7 @@ static int pull_rt_task(struct rq *this_rq) next = pick_next_task_rt(this_rq); - for_each_cpu_mask(cpu, 
this_rq->rd->rto_mask) { + for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4a23517..06b1754 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -301,7 +301,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) return -EINVAL; if (isadd == REGISTER) { - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, cpu_to_node(cpu)); if (!s) @@ -320,7 +320,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) /* Deregister or cleanup */ cleanup: - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 29fc39f..28c2b2c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -397,7 +397,7 @@ void flush_workqueue(struct workqueue_struct *wq) might_sleep(); lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); lock_release(&wq->lockdep_map, 1, _THIS_IP_); - for_each_cpu_mask(cpu, *cpu_map) + for_each_cpu_mask_nr(cpu, *cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -477,7 +477,7 @@ static void wait_on_work(struct work_struct *work) wq = cwq->wq; cpu_map = wq_cpu_map(wq); - for_each_cpu_mask(cpu, *cpu_map) + for_each_cpu_mask_nr(cpu, *cpu_map) wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } @@ -813,7 +813,7 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu_mask(cpu, *cpu_map) + for_each_cpu_mask_nr(cpu, *cpu_map) cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); put_online_cpus(); -- cgit v1.1 From cad0e458d17c643c20c1d38f45a1d26125e6a622 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: clocksource/events: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 4 ++-- kernel/time/tick-broadcast.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dadde53..60ceabd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,9 +145,9 @@ static void clocksource_watchdog(unsigned long data) * Cycle through CPUs to check if the CPUs stay * synchronized to each other. 
*/ - int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); + int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); - if (next_cpu >= NR_CPUS) + if (next_cpu >= nr_cpu_ids) next_cpu = first_cpu(cpu_online_map); watchdog_timer.expires += WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, next_cpu); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 57a1f02..2d0a963 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -397,8 +397,7 @@ again: mask = CPU_MASK_NONE; now = ktime_get(); /* Find all expired events */ - for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; - cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) cpu_set(cpu, mask); -- cgit v1.1 From 9c44bc03fff44ff04237a7d92e35304a0e50c331 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:04 +0200 Subject: softlockup: allow panic on lockup allow users to configure the softlockup detector to generate a panic instead of a warning message. high-availability systems might opt for this strict method (combined with panic_timeout= boot option/sysctl), instead of generating softlockup warnings ad infinitum. also, automated tests work better if the system reboots reliably (into a safe kernel) in case of a lockup. The full spectrum of configurability is supported: boot option, sysctl option and Kconfig option. it's default-disabled. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/softlockup.c | 21 +++++++++++++++++++++ kernel/sysctl.c | 11 +++++++++++ 2 files changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 01b6522..78e0ad2 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -27,6 +27,21 @@ static DEFINE_PER_CPU(struct task_struct *, watchdog_task); static int __read_mostly did_panic; unsigned long __read_mostly softlockup_thresh = 60; +/* + * Should we panic (and reboot, if panic_timeout= is set) when a + * soft-lockup occurs: + */ +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + static int softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) { @@ -120,6 +135,9 @@ void softlockup_tick(void) else dump_stack(); spin_unlock(&print_lock); + + if (softlockup_panic) + panic("softlockup: hung tasks"); } /* @@ -172,6 +190,9 @@ static void check_hung_task(struct task_struct *t, unsigned long now) t->last_switch_timestamp = now; touch_nmi_watchdog(); + + if (softlockup_panic) + panic("softlockup: blocked tasks"); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2911665..2d3b388 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -729,6 +729,17 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_DETECT_SOFTLOCKUP { .ctl_name = CTL_UNNUMBERED, + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_thresh", .data = &softlockup_thresh, .maxlen = sizeof(unsigned long), -- cgit v1.1 From 9383d9679056e6cc4e7ff70f31da945a268238f4 Mon Sep 17 
00:00:00 2001 From: Dimitri Sivanich Date: Mon, 12 May 2008 21:21:14 +0200 Subject: softlockup: fix softlockup_thresh unaligned access and disable detection at runtime Fix unaligned access errors when setting softlockup_thresh on 64 bit platforms. Allow softlockup detection to be disabled by setting softlockup_thresh <= 0. Detect that boot time softlockup detection has been disabled earlier in softlockup_tick. Signed-off-by: Dimitri Sivanich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/softlockup.c | 12 ++++++++++-- kernel/sysctl.c | 9 +++++---- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 78e0ad2..a3a0b23 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp); static DEFINE_PER_CPU(struct task_struct *, watchdog_task); static int __read_mostly did_panic; -unsigned long __read_mostly softlockup_thresh = 60; +int __read_mostly softlockup_thresh = 60; /* * Should we panic (and reboot, if panic_timeout= is set) when a @@ -94,6 +94,14 @@ void softlockup_tick(void) struct pt_regs *regs = get_irq_regs(); unsigned long now; + /* Is detection switched off? */ + if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { + /* Be sure we don't false trigger if switched back on */ + if (touch_timestamp) + per_cpu(touch_timestamp, this_cpu) = 0; + return; + } + if (touch_timestamp == 0) { touch_softlockup_watchdog(); return; @@ -104,7 +112,7 @@ void softlockup_tick(void) /* report at most once a second */ if ((print_timestamp >= touch_timestamp && print_timestamp < (touch_timestamp + 1)) || - did_panic || !per_cpu(watchdog_task, this_cpu)) { + did_panic) { return; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d3b388..31c19a7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -84,12 +84,13 @@ extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; /* Constants used for minimum and maximum */ -#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) +#ifdef CONFIG_HIGHMEM static int one = 1; #endif #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; +static int neg_one = -1; #endif #ifdef CONFIG_MMU @@ -742,11 +743,11 @@ static struct ctl_table kern_table[] = { .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_thresh", .data = &softlockup_thresh, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, - .extra1 = &one, + .extra1 = &neg_one, .extra2 = &sixty, }, { -- cgit v1.1 From 1c4cd6dd1d0fd3057bb6b8c87460049497889d1b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:14 +0200 Subject: softlockup: fix softlockup_thresh fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 31c19a7..a829dc8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -84,7 +84,7 @@ extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; /* Constants used for minimum and maximum */ -#ifdef CONFIG_HIGHMEM +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) static int one = 1; #endif -- cgit v1.1 From 02ff375590ac4140d88afc76505df1ad45c6af59 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 15:43:53 +0200 Subject: softlockup: fix 
false positives on nohz if CPU is 100% idle for more than 60 seconds Fix (probably theoretical only) rq->clock update bug: in tick_nohz_update_jiffies() [which is called on all irq entry on all cpus where the irq entry hits an idle cpu] we call touch_softlockup_watchdog() before we update jiffies. That works fine most of the time when idle timeouts are within 60 seconds. But when an idle timeout is beyond 60 seconds, jiffies is updated with a jump of more than 60 seconds, which causes a jump in cpu-clock of more than 60 seconds, triggering a false positive. Reported-by: David Miller Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a89..28abad6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -133,8 +133,6 @@ void tick_nohz_update_jiffies(void) if (!ts->tick_stopped) return; - touch_softlockup_watchdog(); - cpu_clear(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@ -142,6 +140,8 @@ void tick_nohz_update_jiffies(void) local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); + + touch_softlockup_watchdog(); } void tick_nohz_stop_idle(int cpu) -- cgit v1.1 From 8c2238eaaf0f774ca0f8d9daad7a616429bbb7f1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 27 May 2008 12:23:29 -0500 Subject: softlockup: fix NMI hangs due to lock race - 2.6.26-rc regression The touch_nmi_watchdog() routine on x86 ultimately calls touch_softlockup_watchdog(). The problem is that to touch the softlockup watchdog, the cpu_clock code has to be called which could involve multiple cpu locks and can lead to a hard hang if one of the locks is held by a processor that is not going to return anytime soon (such as could be the case with kgdb or perhaps even with some other kind of exception). This patch causes the public version of the touch_softlockup_watchdog() to defer the cpu clock access to a later point. The test case for this problem is to use the following kernel config options: CONFIG_KGDB_TESTS=y CONFIG_KGDB_TESTS_ON_BOOT=y CONFIG_KGDB_TESTS_BOOT_STRING="V1F100I100000" It should be noted that kgdb test suite and these options were not available until 2.6.26-rc2, so it was necessary to patch the kgdb test suite during the bisection. I would consider this patch a regression fix because the problem first appeared in commit 27ec4407790d075c325e1f4da0a19c56953cce23 when some logic was added to try to periodically sync the clocks. It was possible to work around this particular problem by simply not performing the sync anytime the system was in a critical context. This was ok until commit 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd, which added config option CONFIG_HAVE_UNSTABLE_SCHED_CLOCK and some multi-cpu locks to sync the clocks. It became clear that accessing this code from an nmi was the source of the lockups. Avoiding the access to the low level clock code from an code inside the NMI processing also fixed the problem with the 27ec44... commit. 
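The effect is that the public entry point no longer touches the clock at all; it only marks the per-cpu timestamp as stale, and the clock is read later from the regular timer-tick path. A rough sketch of the resulting split, matching the diff below:

	/* reads cpu_clock(); only safe outside NMI context */
	static void __touch_softlockup_watchdog(void)
	{
		int this_cpu = raw_smp_processor_id();

		__raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu);
	}

	/* public variant, callable from touch_nmi_watchdog(): defer the
	 * clock access by just marking the stamp as stale */
	void touch_softlockup_watchdog(void)
	{
		__raw_get_cpu_var(touch_timestamp) = 0;
	}

softlockup_tick() then re-stamps via __touch_softlockup_watchdog() when it sees touch_timestamp == 0.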
Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index a3a0b23..6b682d8 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -64,12 +64,17 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -void touch_softlockup_watchdog(void) +static void __touch_softlockup_watchdog(void) { int this_cpu = raw_smp_processor_id(); __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); } + +void touch_softlockup_watchdog(void) +{ + __raw_get_cpu_var(touch_timestamp) = 0; +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -103,7 +108,7 @@ void softlockup_tick(void) } if (touch_timestamp == 0) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -118,7 +123,7 @@ void softlockup_tick(void) /* do not print during early bootup: */ if (unlikely(system_state != SYSTEM_RUNNING)) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -243,7 +248,7 @@ static int watchdog(void *__bind_cpu) sched_setscheduler(current, SCHED_FIFO, ¶m); /* initialize timestamp */ - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); set_current_state(TASK_INTERRUPTIBLE); /* @@ -252,7 +257,7 @@ static int watchdog(void *__bind_cpu) * debug-printout triggers in softlockup_tick(). */ while (!kthread_should_stop()) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); schedule(); if (kthread_should_stop()) -- cgit v1.1 From 688c91755dc3d3c03d8c67c1df13c02be258768e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 16 Jun 2008 15:51:08 -0700 Subject: softlockup: print a module list on being stuck Most places in the kernel that go BUG: print a module list (which is very useful for doing statistics and finding patterns), however the softlockup detector does not do this yet. This patch adds the one line change to fix this gap. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 6b682d8..f2bf5de 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -143,6 +143,7 @@ void softlockup_tick(void) printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", this_cpu, now - touch_timestamp, current->comm, task_pid_nr(current)); + print_modules(); if (regs) show_regs(regs); else -- cgit v1.1 From 8d5be7f4e8515af461cbc8f07687ccc81507d508 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 25 Jun 2008 08:50:10 +0200 Subject: softlockup: show irqtrace This patch adds some information about when interrupts were last enabled and disabled to the output of the softlockup detector. 
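Combined with the module-list patch above, the report emitted by softlockup_tick() now looks roughly like this (condensed from the two diffs):

	printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
		this_cpu, now - touch_timestamp,
		current->comm, task_pid_nr(current));
	print_modules();
	print_irqtrace_events(current);	/* where irqs were last enabled/disabled */
	if (regs)
		show_regs(regs);
	else
		dump_stack();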
Signed-off-by: Vegard Nossum Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index f2bf5de..97977ec 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -144,6 +145,7 @@ void softlockup_tick(void) this_cpu, now - touch_timestamp, current->comm, task_pid_nr(current)); print_modules(); + print_irqtrace_events(current); if (regs) show_regs(regs); else -- cgit v1.1 From ee7e5516be4f2107535ad5a3d47d9c79f93661a2 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 29 Jun 2008 14:18:46 +0400 Subject: generic: per-device coherent dma allocator Currently x86_32, sh and cris-v32 provide per-device coherent dma memory allocator. However their implementation is nearly identical. Refactor out common code to be reused by them. Signed-off-by: Dmitry Baryshkov Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/dma-coherent.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 kernel/dma-coherent.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938a..9e287d8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c new file mode 100644 index 0000000..89a554c --- /dev/null +++ b/kernel/dma-coherent.c @@ -0,0 +1,127 @@ +/* + * Coherent per-device memory handling. 
+ * Borrowed from i386 + */ +#include +#include + +struct dma_coherent_mem { + void *virt_base; + u32 device_base; + int size; + int flags; + unsigned long *bitmap; +}; + +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; + if (!size) + goto out; + if (dev->dma_mem) + goto out; + + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ + + mem_base = ioremap(bus_addr, size); + if (!mem_base) + goto out; + + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dev->dma_mem) + goto out; + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dev->dma_mem->bitmap) + goto free1_out; + + dev->dma_mem->virt_base = mem_base; + dev->dma_mem->device_base = device_addr; + dev->dma_mem->size = pages; + dev->dma_mem->flags = flags; + + if (flags & DMA_MEMORY_MAP) + return DMA_MEMORY_MAP; + + return DMA_MEMORY_IO; + + free1_out: + kfree(dev->dma_mem); + out: + if (mem_base) + iounmap(mem_base); + return 0; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if (!mem) + return; + dev->dma_mem = NULL; + iounmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + int pos, err; + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); + + pages >>= PAGE_SHIFT; + + if (!mem) + return ERR_PTR(-EINVAL); + + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +int dma_alloc_from_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + *ret = mem->virt_base + (page << PAGE_SHIFT); + memset(*ret, 0, size); + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + *ret = NULL; + } + return (mem != NULL); +} + +int dma_release_from_coherent(struct device *dev, int order, void *vaddr) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + + if (mem && vaddr >= mem->virt_base && vaddr < + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); + return 1; + } + return 0; +} -- cgit v1.1 From dd7a1e5615b1719c0fdffee1ea5a7820ac8141a6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 27 Jun 2008 15:07:21 +0200 Subject: softlockup: fix watchdog task wakeup frequency Updating the timestamp more often is pointless as we print the warnings only if we exceed the threshold. And the check for hung tasks relies on the last timestamp, so it will keep working correctly, too. 
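Concretely, the per-tick wakeup condition goes from a fixed one-second interval to half of the configured threshold (sketch of the before/after, matching the diff below):

	/* before: wake the high-prio watchdog task every second */
	if (now > (touch_timestamp + 1))
		wake_up_process(per_cpu(watchdog_task, this_cpu));

	/* after: wake it up twice per threshold timespan */
	if (now > touch_timestamp + softlockup_thresh/2)
		wake_up_process(per_cpu(watchdog_task, this_cpu));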
Signed-off-by: Johannes Weiner Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 97977ec..d53ab70 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -130,8 +130,11 @@ void softlockup_tick(void) now = get_timestamp(this_cpu); - /* Wake up the high-prio watchdog task every second: */ - if (now > (touch_timestamp + 1)) + /* + * Wake up the high-prio watchdog task twice per + * threshold timespan. + */ + if (now > touch_timestamp + softlockup_thresh/2) wake_up_process(per_cpu(watchdog_task, this_cpu)); /* Warn about unreasonable delays: */ -- cgit v1.1 From 3e2f69fdd1b00166e7d589bce56b2d36a9e74374 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Jul 2008 09:12:04 +0200 Subject: softlockup: fix watchdog task wakeup frequency The print_timestamp can never be bigger than the touch_timestamp, at maximum it can be equal. And if it is, the second check for touch_timestamp + 1 bigger print_timestamp is always true, too. The check for equality is sufficient as we proceed in one-second-steps and are at least one second away from the last print-out if we have another timestamp. Signed-off-by: Johannes Weiner Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d53ab70..7bd8d1a 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -116,11 +116,8 @@ void softlockup_tick(void) print_timestamp = per_cpu(print_timestamp, this_cpu); /* report at most once a second */ - if ((print_timestamp >= touch_timestamp && - print_timestamp < (touch_timestamp + 1)) || - did_panic) { + if (print_timestamp == touch_timestamp || did_panic) return; - } /* do not print during early bootup: */ if (unlikely(system_state != SYSTEM_RUNNING)) { -- cgit v1.1 From 0775b3dbcb6d17b531b36df520ddab735647f3f7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:08 -0700 Subject: suspend, xen: enable PM_SLEEP for CONFIG_XEN Xen save/restore requires PM_SLEEP to be set without requiring SUSPEND or HIBERNATION. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b45da40..1436c47 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION + depends on SUSPEND || HIBERNATION || XEN default y config SUSPEND -- cgit v1.1 From 6717ef1aa750b54ddb9d8854b91707ee21f0ad23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 22:17:01 +0200 Subject: Revert "suspend, xen: enable PM_SLEEP for CONFIG_XEN" This reverts commit 6fbbec428c8e7bb617da2e8a589af2e97bcf3bc4. Rafael doesnt like it - it breaks various assumptions. 
Signed-off-by: Ingo Molnar --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 1436c47..b45da40 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION || XEN + depends on SUSPEND || HIBERNATION default y config SUSPEND -- cgit v1.1 From 93a0886e2368eafb9df5e2021fb185195cee88b2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 Jul 2008 13:43:42 -0700 Subject: x86, xen, power: fix up config dependencies on PM Xen save/restore needs bits of code enabled by PM_SLEEP, and PM_SLEEP depends on PM. So make XEN_SAVE_RESTORE depend on PM and PM_SLEEP depend on XEN_SAVE_RESTORE. Signed-off-by: Jeremy Fitzhardinge Acked-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b45da40..59dfdf1 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION + depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y config SUSPEND -- cgit v1.1 From d88c16919793a9f5dc93e7956da5bb089c7600b4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Jul 2008 08:59:24 +0200 Subject: Revert parts of "ftrace: do not trace scheduler functions" the removal of -mno-spe in the !ftrace case was not intended. Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 985ddb7..15ab63f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o +CFLAGS_REMOVE_sched.o = -mno-spe + ifdef CONFIG_FTRACE # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg -- cgit v1.1 From 13b40c1e40f3261e83ee514a08b77dbecb93021b Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 1 Jul 2008 10:32:50 -0700 Subject: sched: reduce stack size in isolated_cpu_setup() * Remove 16k stack requirements in isolated_cpu_setup when NR_CPUS=4096. Signed-off-by: Mike Travis Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 99e6d85..1ee18db 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6768,7 +6768,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - int ints[NR_CPUS], i; + static int __initdata ints[NR_CPUS]; + int i; str = get_options(str, ARRAY_SIZE(ints), ints); cpus_clear(cpu_isolated_map); -- cgit v1.1 From 7ebefa8ceefed44cc321be70afc54a585a68ac0b Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Tue, 1 Jul 2008 23:32:15 +0200 Subject: sched: rework of "prioritize non-migratable tasks over migratable ones" (1) handle in a generic way all cases when a newly woken-up task is not migratable (not just a corner case when "rt_se->nr_cpus_allowed == 1") (2) if current is to be preempted, then make sure "p" will be picked up by pick_next_task_rt(). i.e. move task's group at the head of its list as well. 
currently, it's not a case for the group-scheduling case as described here: http://www.ussg.iu.edu/hypermail/linux/kernel/0807.0/0134.html Signed-off-by: Dmitry Adamushko Cc: Steven Rostedt Cc: Gregory Haskins Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 68 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 47ceac9..d3d1ccc 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -599,11 +599,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; - if (rt_se->nr_cpus_allowed == 1) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - + list_add_tail(&rt_se->run_list, queue); __set_bit(rt_se_prio(rt_se), array->bitmap); inc_rt_tasks(rt_se, rt_rq); @@ -688,32 +684,34 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. */ -static -void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +static void +requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) { - struct rt_prio_array *array = &rt_rq->active; - if (on_rt_rq(rt_se)) { - list_del_init(&rt_se->run_list); - list_add_tail(&rt_se->run_list, - array->queue + rt_se_prio(rt_se)); + struct rt_prio_array *array = &rt_rq->active; + struct list_head *queue = array->queue + rt_se_prio(rt_se); + + if (head) + list_move(&rt_se->run_list, queue); + else + list_move_tail(&rt_se->run_list, queue); } } -static void requeue_task_rt(struct rq *rq, struct task_struct *p) +static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) { struct sched_rt_entity *rt_se = &p->rt; struct rt_rq *rt_rq; for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); - requeue_rt_entity(rt_rq, rt_se); + requeue_rt_entity(rt_rq, rt_se, head); } } static void yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, rq->curr); + requeue_task_rt(rq, rq->curr, 0); } #ifdef CONFIG_SMP @@ -753,6 +751,30 @@ static int select_task_rq_rt(struct task_struct *p, int sync) */ return task_cpu(p); } + +static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) +{ + cpumask_t mask; + + if (rq->curr->rt.nr_cpus_allowed == 1) + return; + + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, &mask)) + return; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) + return; + + /* + * There appears to be other cpus that can accept + * current and none to run 'p', so lets reschedule + * to try and push current away: + */ + requeue_task_rt(rq, p, 1); + resched_task(rq->curr); +} + #endif /* CONFIG_SMP */ /* @@ -778,18 +800,8 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) * to move current somewhere else, making room for our non-migratable * task. 
*/ - if((p->prio == rq->curr->prio) - && p->rt.nr_cpus_allowed == 1 - && rq->curr->rt.nr_cpus_allowed != 1) { - cpumask_t mask; - - if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - /* - * There appears to be other cpus that can accept - * current, so lets reschedule to try and push it away - */ - resched_task(rq->curr); - } + if (p->prio == rq->curr->prio && !need_resched()) + check_preempt_equal_prio(rq, p); #endif } @@ -1415,7 +1427,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) * on the queue: */ if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p); + requeue_task_rt(rq, p, 0); set_tsk_need_resched(p); } } -- cgit v1.1 From e761b7725234276a802322549cee5255305a0930 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 15 Jul 2008 04:43:49 -0700 Subject: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment (take 2) This is based on Linus' idea of creating cpu_active_map that prevents scheduler load balancer from migrating tasks to the cpu that is going down. It allows us to simplify domain management code and avoid unecessary domain rebuilds during cpu hotplug event handling. Please ignore the cpusets part for now. It needs some more work in order to avoid crazy lock nesting. Although I did simplfy and unify domain reinitialization logic. We now simply call partition_sched_domains() in all the cases. This means that we're using exact same code paths as in cpusets case and hence the test below cover cpusets too. Cpuset changes to make rebuild_sched_domains() callable from various contexts are in the separate patch (right next after this one). This not only boots but also easily handles while true; do make clean; make -j 8; done and while true; do on-off-cpu 1; done at the same time. (on-off-cpu 1 simple does echo 0/1 > /sys/.../cpu1/online thing). Suprisingly the box (dual-core Core2) is quite usable. In fact I'm typing this on right now in gnome-terminal and things are moving just fine. Also this is running with most of the debug features enabled (lockdep, mutex, etc) no BUG_ONs or lockdep complaints so far. I believe I addressed all of the Dmitry's comments for original Linus' version. I changed both fair and rt balancer to mask out non-active cpus. And replaced cpu_is_offline() with !cpu_active() in the main scheduler code where it made sense (to me). 
Signed-off-by: Max Krasnyanskiy Acked-by: Linus Torvalds Acked-by: Peter Zijlstra Acked-by: Gregory Haskins Cc: dmitry.adamushko@gmail.com Cc: pj@sgi.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 30 ++++++++++++--- kernel/cpuset.c | 2 +- kernel/sched.c | 108 ++++++++++++++++++++++------------------------------ kernel/sched_fair.c | 3 ++ kernel/sched_rt.c | 7 ++++ 5 files changed, 80 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index cfb1d43..a1ac7ea 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void) cpu_hotplug.refcount = 0; } +cpumask_t cpu_active_map; + #ifdef CONFIG_HOTPLUG_CPU void get_online_cpus(void) @@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu) int err = 0; cpu_maps_update_begin(); - if (cpu_hotplug_disabled) + + if (cpu_hotplug_disabled) { err = -EBUSY; - else - err = _cpu_down(cpu, 0); + goto out; + } + + cpu_clear(cpu, cpu_active_map); + + err = _cpu_down(cpu, 0); + + if (cpu_online(cpu)) + cpu_set(cpu, cpu_active_map); +out: cpu_maps_update_done(); return err; } @@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu) } cpu_maps_update_begin(); - if (cpu_hotplug_disabled) + + if (cpu_hotplug_disabled) { err = -EBUSY; - else - err = _cpu_up(cpu, 0); + goto out; + } + err = _cpu_up(cpu, 0); + + if (cpu_online(cpu)) + cpu_set(cpu, cpu_active_map); + +out: cpu_maps_update_done(); return err; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 459d601..3c3ef02 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) * partition_sched_domains(). */ -static void rebuild_sched_domains(void) +void rebuild_sched_domains(void) { struct kfifo *q; /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ diff --git a/kernel/sched.c b/kernel/sched.c index 1ee18db..c237624 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) rq = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) - || unlikely(cpu_is_offline(dest_cpu))) + || unlikely(!cpu_active(dest_cpu))) goto out; /* force the process onto the specified CPU */ @@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick) /* * If we are going offline and still the leader, give up! */ - if (cpu_is_offline(cpu) && + if (!cpu_active(cpu) && atomic_read(&nohz.load_balancer) == cpu) { if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); @@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) struct rq *rq_dest, *rq_src; int ret = 0, on_rq; - if (unlikely(cpu_is_offline(dest_cpu))) + if (unlikely(!cpu_active(dest_cpu))) return ret; rq_src = cpu_rq(src_cpu); @@ -7554,18 +7554,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void) } /* - * Free current domain masks. - * Called after all cpus are attached to NULL domain. - */ -static void free_sched_domains(void) -{ - ndoms_cur = 0; - if (doms_cur != &fallback_doms) - kfree(doms_cur); - doms_cur = &fallback_doms; -} - -/* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. @@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * ownership of it and will kfree it when done with it. 
If the caller * failed the kmalloc call, then it can pass in doms_new == NULL, * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms'. + * 'fallback_doms', it also forces the domains to be rebuilt. * * Call with hotplug lock held */ @@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) { - ndoms_new = 1; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; - } + if (doms_new == NULL) + ndoms_new = 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { @@ -7677,6 +7661,14 @@ match1: ; } + if (doms_new == NULL) { + ndoms_cur = 0; + ndoms_new = 1; + doms_new = &fallback_doms; + cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + dattr_new = NULL; + } + /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { @@ -7707,17 +7699,10 @@ match2: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int arch_reinit_sched_domains(void) { - int err; - get_online_cpus(); - mutex_lock(&sched_domains_mutex); - detach_destroy_domains(&cpu_online_map); - free_sched_domains(); - err = arch_init_sched_domains(&cpu_online_map); - mutex_unlock(&sched_domains_mutex); + rebuild_sched_domains(); put_online_cpus(); - - return err; + return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) @@ -7783,59 +7768,49 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +#ifndef CONFIG_CPUSETS /* - * Force a reinitialization of the sched domains hierarchy. The domains - * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to the NULL domain - * which will prevent rebalancing while the sched domains are recalculated. + * Add online and remove offline CPUs from the scheduler domains. + * When cpusets are enabled they take over this function. */ static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + partition_sched_domains(0, NULL, NULL); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} +#endif + +static int update_runtime(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ int cpu = (int)(long)hcpu; switch (action) { case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: disable_runtime(cpu_rq(cpu)); - /* fall-through */ - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - detach_destroy_domains(&cpu_online_map); - free_sched_domains(); return NOTIFY_OK; - case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: enable_runtime(cpu_rq(cpu)); - /* fall-through */ - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* - * Fall through and re-initialise the domains. - */ - break; + return NOTIFY_OK; + default: return NOTIFY_DONE; } - -#ifndef CONFIG_CPUSETS - /* - * Create default domain partitioning if cpusets are disabled. - * Otherwise we let cpusets rebuild the domains based on the - * current setup. 
- */ - - /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(&cpu_online_map); -#endif - - return NOTIFY_OK; } void __init sched_init_smp(void) @@ -7855,8 +7830,15 @@ void __init sched_init_smp(void) cpu_set(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); + +#ifndef CONFIG_CPUSETS /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); +#endif + + /* RT runtime code needs to handle some hotplug events */ + hotcpu_notifier(update_runtime, 0); + init_hrtick(); /* Move init over to a non-isolated CPU */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2aa987..d924c67 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq) * not idle and an idle cpu is available. The span of cpus to * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. + * Domains may include CPUs that are not usable for migration, + * hence we need to mask them out (cpu_active_map) * * Returns the CPU we should wake onto. */ @@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); + cpus_and(tmp, tmp, cpu_active_map); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) { if (i != task_cpu(p)) { diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d3d1ccc..50735bb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -934,6 +934,13 @@ static int find_lowest_rq(struct task_struct *task) return -1; /* No targets found */ /* + * Only consider CPUs that are usable for migration. + * I guess we might want to change cpupri_find() to ignore those + * in the first place. + */ + cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + + /* * At this point we have built a mask of cpus representing the * lowest priority tasks in the system. Now we want to elect * the best one based on our affinity and topology. -- cgit v1.1 From 39b0fad7121eace85770e7a4c6dc35dfd2879768 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 15 Jul 2008 20:56:26 -0700 Subject: cpu hotplug: Make cpu_active_map synchronization dependency clear This goes on top of the cpu_active_map (take 2) patch. Currently we depend on the stop_machine to provide nescessesary synchronization for the cpu_active_map updates. As Dmitry Adamushko pointed this is fragile and is not much clearer than the previous scheme. In other words we do not want to depend on the internal stop machine operation here. So make the synchronization rules clear by doing synchronize_sched() after clearing out cpu active bit. Tested on quad-Core2 with: while true; do for i in 1 2 3; do echo 0 > /sys/devices/system/cpu/cpu$i/online done for i in 1 2 3; do echo 1 > /sys/devices/system/cpu/cpu$i/online done done and stress -c 200 No lockdep, preempt or other complaints. Signed-off-by: Max Krasnyansky Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/cpu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a1ac7ea..033603c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -301,6 +301,16 @@ int __ref cpu_down(unsigned int cpu) cpu_clear(cpu, cpu_active_map); + /* + * Make sure the all cpus did the reschedule and are not + * using stale version of the cpu_active_map. 
+ * This is not strictly necessary becuase stop_machine() + * that we run down the line already provides the required + * synchronization. But it's really a side effect and we do not + * want to depend on the innards of the stop_machine here. + */ + synchronize_sched(); + err = _cpu_down(cpu, 0); if (cpu_online(cpu)) -- cgit v1.1 From e59494f441c834ca7aaa0e6fa6678ddbd3d72743 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Jul 2008 00:13:45 -0400 Subject: ftrace: fix 4d3702b6 (post-v2.6.26): WARNING: at kernel/lockdep.c:2731 check_flags (ftrace) On Wed, 16 Jul 2008, Vegard Nossum wrote: > When booting 4d3702b6, I got this huge thing: > > Testing tracer wakeup: <4>------------[ cut here ]------------ > WARNING: at kernel/lockdep.c:2731 check_flags+0x123/0x160() > Modules linked in: > Pid: 1, comm: swapper Not tainted 2.6.26-crashing-02127-g4d3702b6 #30 > [] warn_on_slowpath+0x59/0xb0 > [] ? ftrace_call+0x5/0x8 > [] ? native_read_tsc+0x0/0x20 > [] ? sub_preempt_count+0x12/0xf0 > [] ? trace_hardirqs_off+0xb/0x10 > [] ? __lock_acquire+0x2cc/0x1120 > [] ? trace_hardirqs_off+0xb/0x10 > [] ? mcount_call+0x5/0xa > [] check_flags+0x123/0x160 > [] lock_acquire+0x51/0xd0 > [] ? ftrace_call+0x5/0x8 > [] _spin_lock_irqsave+0x5f/0xa0 > [] ? ftrace_record_ip+0xf5/0x220 > [] ? debug_locks_off+0x3/0x50 > [] ftrace_record_ip+0xf5/0x220 > [] mcount_call+0x5/0xa > [] ? debug_locks_off+0x8/0x50 > [] check_flags+0xf7/0x160 > [] lock_acquire+0x51/0xd0 > [] ? ftrace_call+0x5/0x8 > [] _spin_lock_irqsave+0x5f/0xa0 > [] ? wakeup_tracer_call+0x6d/0xf0 > [] ? _local_bh_enable+0x62/0xb0 > [] ? sub_preempt_count+0xd/0xf0 > [] wakeup_tracer_call+0x6d/0xf0 > [] ? __do_softirq+0xf4/0x110 > [] ? wakeup_tracer_call+0x91/0xf0 > [] ftrace_call+0x5/0x8 > [] ? __do_softirq+0xf4/0x110 > [] ? sub_preempt_count+0x12/0xf0 > [] _local_bh_enable+0x62/0xb0 > [] __do_softirq+0xf4/0x110 > [] do_softirq+0xad/0xb0 > [] irq_exit+0xa5/0xb0 > [] smp_apic_timer_interrupt+0x66/0xa0 > [] ? trace_hardirqs_off_thunk+0xc/0x10 > [] apic_timer_interrupt+0x2d/0x34 > [] ? find_usage_backwards+0xb/0xf0 > [] ? _spin_unlock_irqrestore+0x69/0x80 > [] tg_shares_up+0x132/0x1d0 > [] walk_tg_tree+0x62/0xa0 > [] ? tg_shares_up+0x0/0x1d0 > [] ? tg_nop+0x0/0x10 > [] update_shares+0x5d/0x80 > [] try_to_wake_up+0x6f/0x280 > [] ? __ftrace_modify_code+0x0/0xc0 > [] ? __ftrace_modify_code+0x0/0xc0 > [] wake_up_process+0x14/0x20 > [] kthread_create+0x66/0xb0 > [] ? do_stop+0x0/0x200 > [] ? __stop_machine_run+0x30/0xb0 > [] __stop_machine_run+0x50/0xb0 > [] ? do_stop+0x0/0x200 > [] ? __ftrace_modify_code+0x0/0xc0 > [] ? mutex_unlock+0xd/0x10 > [] stop_machine_run+0x2c/0x60 > [] unregister_ftrace_function+0x103/0x180 > [] stop_wakeup_tracer+0x17/0x60 > [] wakeup_tracer_ctrl_update+0xf/0x30 > [] trace_selftest_startup_wakeup+0xb5/0x130 > [] ? trace_wakeup_test_thread+0x0/0x70 > [] register_tracer+0x135/0x1b0 > [] init_wakeup_tracer+0xd/0xf > [] kernel_init+0x1a9/0x2ce > [] ? _spin_unlock_irq+0x3b/0x60 > [] ? trace_hardirqs_on_thunk+0xc/0x10 > [] ? init_wakeup_tracer+0x0/0xf > [] ? trace_hardirqs_on_caller+0x126/0x180 > [] ? trace_hardirqs_on_thunk+0xc/0x10 > [] ? restore_nocheck_notrace+0x0/0xe > [] ? kernel_init+0x0/0x2ce > [] ? 
kernel_init+0x0/0x2ce > [] kernel_thread_helper+0x7/0x10 > ======================= > ---[ end trace a7919e7f17c0a725 ]--- > irq event stamp: 579530 > hardirqs last enabled at (579528): [] trace_hardirqs_on+0xb/0x10 > hardirqs last disabled at (579529): [] trace_hardirqs_off+0xb/0x10 > softirqs last enabled at (579530): [] __do_softirq+0xf4/0x110 > softirqs last disabled at (579517): [] do_softirq+0xad/0xb0 > irq event stamp: 579530 > hardirqs last enabled at (579528): [] trace_hardirqs_on+0xb/0x10 > hardirqs last disabled at (579529): [] trace_hardirqs_off+0xb/0x10 > softirqs last enabled at (579530): [] __do_softirq+0xf4/0x110 > softirqs last disabled at (579517): [] do_softirq+0xad/0xb0 > PASSED > > Incidentally, the kernel also hung while I was typing in this report. Things get weird between lockdep and ftrace because ftrace can be called within lockdep internal code (via the mcount pointer) and lockdep can be called with ftrace (via spin_locks). Signed-off-by: Steven Rostedt Tested-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3c8d61d..e303ccb 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -26,7 +26,8 @@ static struct task_struct *wakeup_task; static int wakeup_cpu; static unsigned wakeup_prio = -1; -static DEFINE_SPINLOCK(wakeup_lock); +static raw_spinlock_t wakeup_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static void __wakeup_reset(struct trace_array *tr); @@ -56,7 +57,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) if (unlikely(disabled != 1)) goto out; - spin_lock_irqsave(&wakeup_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); if (unlikely(!wakeup_task)) goto unlock; @@ -71,7 +73,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags); unlock: - spin_unlock_irqrestore(&wakeup_lock, flags); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); out: atomic_dec(&data->disabled); @@ -145,7 +148,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (likely(disabled != 1)) goto out; - spin_lock_irqsave(&wakeup_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); /* We could race with grabbing wakeup_lock */ if (unlikely(!tracer_enabled || next != wakeup_task)) @@ -174,7 +178,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, out_unlock: __wakeup_reset(tr); - spin_unlock_irqrestore(&wakeup_lock, flags); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); out: atomic_dec(&tr->data[cpu]->disabled); } @@ -209,8 +214,6 @@ static void __wakeup_reset(struct trace_array *tr) struct trace_array_cpu *data; int cpu; - assert_spin_locked(&wakeup_lock); - for_each_possible_cpu(cpu) { data = tr->data[cpu]; tracing_reset(data); @@ -229,9 +232,11 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; - spin_lock_irqsave(&wakeup_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); __wakeup_reset(tr); - spin_unlock_irqrestore(&wakeup_lock, flags); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); } static void @@ -252,7 +257,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, goto out; /* interrupts should be off from try_to_wake_up */ - spin_lock(&wakeup_lock); + 
__raw_spin_lock(&wakeup_lock); /* check for races. */ if (!tracer_enabled || p->prio >= wakeup_prio) @@ -274,7 +279,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, CALLER_ADDR1, CALLER_ADDR2, flags); out_locked: - spin_unlock(&wakeup_lock); + __raw_spin_unlock(&wakeup_lock); out: atomic_dec(&tr->data[cpu]->disabled); } -- cgit v1.1 From 577b4a58d2e74a4d48050eeea3e3f952ce04eb86 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Jul 2008 13:34:54 +0100 Subject: sched: fix warning in inc_rt_tasks() to not declare variable 'rq' if it's not needed Fix inc_rt_tasks() to not declare variable 'rq' if it's not needed. It is declared if CONFIG_SMP or CONFIG_RT_GROUP_SCHED, but only used if CONFIG_SMP. This is a consequence of patch 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 plus patch 1100ac91b6af02d8639d518fad5b434b1bf44ed6. Signed-off-by: David Howells Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 47ceac9..147004c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -505,7 +505,9 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_se_prio(rt_se) < rt_rq->highest_prio) { +#ifdef CONFIG_SMP struct rq *rq = rq_of_rt_rq(rt_rq); +#endif rt_rq->highest_prio = rt_se_prio(rt_se); #ifdef CONFIG_SMP -- cgit v1.1 From b8f8c3cf0a4ac0632ec3f0e15e9dc0c29de917af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jul 2008 17:27:28 +0200 Subject: nohz: prevent tick stop outside of the idle loop Jack Ren and Eric Miao tracked down the following long standing problem in the NOHZ code: scheduler switch to idle task enable interrupts Window starts here ----> interrupt happens (does not set NEED_RESCHED) irq_exit() stops the tick ----> interrupt happens (does set NEED_RESCHED) return from schedule() cpu_idle(): preempt_disable(); Window ends here The interrupts can happen at any point inside the race window. The first interrupt stops the tick, the second one causes the scheduler to rerun and switch away from idle again and we end up with the tick disabled. The fact that it needs two interrupts where the first one does not set NEED_RESCHED and the second one does made the bug obscure and extremly hard to reproduce and analyse. Kudos to Jack and Eric. Solution: Limit the NOHZ functionality to the idle loop to make sure that we can not run into such a situation ever again. cpu_idle() { preempt_disable(); while(1) { tick_nohz_stop_sched_tick(1); <- tell NOHZ code that we are in the idle loop while (!need_resched()) halt(); tick_nohz_restart_sched_tick(); <- disables NOHZ mode preempt_enable_no_resched(); schedule(); preempt_disable(); } } In hindsight we should have done this forever, but ... /me grabs a large brown paperbag. 
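For reference, a minimal user-space model of the inidle gate this patch introduces (a simplified sketch only, not the kernel code; the real tick_nohz_stop_sched_tick() also handles jiffies, hrtimers and RCU). irq_exit() calls with inidle == 0 and may only stop the tick once the idle loop has announced itself with inidle == 1, which closes the race window described above:

```c
#include <stdbool.h>
#include <stdio.h>

struct tick_sched {
	bool inidle;
	bool tick_stopped;
};

static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int inidle)
{
	if (!inidle && !ts->inidle)
		return;			/* irq_exit() outside the idle loop: refuse */

	ts->inidle = true;		/* idle loop (inidle == 1) opens the window */
	ts->tick_stopped = true;	/* stand-in for actually stopping the tick */
}

static void tick_nohz_restart_sched_tick(struct tick_sched *ts)
{
	if (!ts->inidle || !ts->tick_stopped) {
		ts->inidle = false;
		return;
	}
	ts->inidle = false;
	ts->tick_stopped = false;
}

int main(void)
{
	struct tick_sched ts = { false, false };

	tick_nohz_stop_sched_tick(&ts, 0);	/* irq_exit() before idle: no-op */
	printf("tick_stopped=%d\n", ts.tick_stopped);

	tick_nohz_stop_sched_tick(&ts, 1);	/* from cpu_idle(): allowed */
	printf("tick_stopped=%d\n", ts.tick_stopped);

	tick_nohz_restart_sched_tick(&ts);
	printf("tick_stopped=%d\n", ts.tick_stopped);
	return 0;
}
```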
Debugged-by: Jack Ren , Debugged-by: eric miao Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 2 +- kernel/time/tick-sched.c | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e0617..05f2480 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -312,7 +312,7 @@ void irq_exit(void) #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(0); rcu_irq_exit(); #endif preempt_enable_no_resched(); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 86baa4f..ee962d1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -195,7 +195,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) * Called either from the idle loop or from irq_exit() when an idle period was * just interrupted by an interrupt which did not cause a reschedule. */ -void tick_nohz_stop_sched_tick(void) +void tick_nohz_stop_sched_tick(int inidle) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; struct tick_sched *ts; @@ -224,6 +224,11 @@ void tick_nohz_stop_sched_tick(void) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; + if (!inidle && !ts->inidle) + goto end; + + ts->inidle = 1; + if (need_resched()) goto end; @@ -372,11 +377,14 @@ void tick_nohz_restart_sched_tick(void) local_irq_disable(); tick_nohz_stop_idle(cpu); - if (!ts->tick_stopped) { + if (!ts->inidle || !ts->tick_stopped) { + ts->inidle = 0; local_irq_enable(); return; } + ts->inidle = 0; + rcu_exit_nohz(); /* Update jiffies first */ -- cgit v1.1 From 4dca10a96041f78bed11ce9e4a5cfde813ec4ccb Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 7 Jul 2008 18:37:04 -0700 Subject: softlockup: fix invalid proc_handler for softlockup_panic The type of softlockup_panic is int, but the proc_handler is proc_doulongvec_minmax(). This handler is for unsigned long. This handler should be proc_dointvec_minmax(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e56d2f..ab59ac0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -741,7 +741,7 @@ static struct ctl_table kern_table[] = { .data = &softlockup_panic, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one, -- cgit v1.1 From 8df185a95c9b84fc0c3c02224e64fdc5b83bae34 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 8 Jul 2008 15:55:48 -0700 Subject: kthread: reduce stack pressure in create_kthread and kthreadd * Replace: set_cpus_allowed(..., CPU_MASK_ALL) with: set_cpus_allowed_ptr(..., CPU_MASK_ALL_PTR) to remove excessive stack requirements when NR_CPUS=4096. 
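To illustrate the stack saving, here is a small stand-alone sketch (hypothetical helper names, not the kernel API): with NR_CPUS=4096 a cpumask_t is 512 bytes, so a by-value call copies the whole mask onto the stack at every call site, while the pointer variant costs only sizeof(void *):

```c
#include <stdio.h>

#define NR_CPUS		4096
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

typedef struct {
	unsigned long bits[NR_CPUS / BITS_PER_LONG];
} cpumask_t;

static void set_allowed_by_value(cpumask_t mask)	/* old style */
{
	(void)mask;		/* the whole 512-byte mask was copied to get here */
}

static void set_allowed_by_ptr(const cpumask_t *mask)	/* _ptr style */
{
	(void)mask;		/* only a pointer crossed the call boundary */
}

int main(void)
{
	static cpumask_t all;	/* static: stand-in for CPU_MASK_ALL_PTR's target */

	for (unsigned int i = 0; i < NR_CPUS / BITS_PER_LONG; i++)
		all.bits[i] = ~0UL;

	printf("sizeof(cpumask_t) = %zu bytes\n", sizeof(cpumask_t));
	set_allowed_by_value(all);	/* costs sizeof(cpumask_t) of stack */
	set_allowed_by_ptr(&all);	/* costs sizeof(void *) */
	return 0;
}
```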
Signed-off-by: Mike Travis Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index ac3fb73..6111c27 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -106,7 +106,7 @@ static void create_kthread(struct kthread_create_info *create) */ sched_setscheduler(create->result, SCHED_NORMAL, ¶m); set_user_nice(create->result, KTHREAD_NICE_LEVEL); - set_cpus_allowed(create->result, CPU_MASK_ALL); + set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); } complete(&create->done); } @@ -233,7 +233,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_user_nice(tsk, KTHREAD_NICE_LEVEL); - set_cpus_allowed(tsk, CPU_MASK_ALL); + set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; -- cgit v1.1 From 1e01cb0c6ff7e9ddb6547551794c6aa82785a7cb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 15 Jul 2008 09:53:37 -0400 Subject: ftrace: only trace preempt off with preempt tracer When PREEMPT_TRACER and IRQSOFF_TRACER are both configured and irqsoff tracer is running, the preempt_off sections might also be traced. Thanks to Andrew Morton for pointing out my mistake of spin_lock disabling interrupts while he was reviewing ftrace.txt. Seems that my example I used actually hit this bug. Signed-off-by: Steven Rostedt Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 421d6fe..b1e4a89 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -337,12 +337,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { - stop_critical_timing(a0, a1); + if (preempt_trace()) + stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - start_critical_timing(a0, a1); + if (preempt_trace()) + start_critical_timing(a0, a1); } #endif /* CONFIG_PREEMPT_TRACER */ -- cgit v1.1 From 538c29d43ebdac2edcef96ac07982d2296a63077 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2008 13:29:57 +0400 Subject: Generic dma-coherent: fix DMA_MEMORY_EXCLUSIVE Don't rewrite successfull allocation return values in case the memory was marked with DMA_MEMORY_EXCLUSIVE. Signed-off-by: Dmitry Baryshkov Cc: Jesse Barnes Signed-off-by: Ingo Molnar --- kernel/dma-coherent.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 89a554c..56dff5c 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -105,8 +105,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, *dma_handle = mem->device_base + (page << PAGE_SHIFT); *ret = mem->virt_base + (page << PAGE_SHIFT); memset(*ret, 0, size); - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) *ret = NULL; } return (mem != NULL); -- cgit v1.1 From 65c011845316d3c1381f478ca0d8265c43b3b039 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:30 -0700 Subject: cpumask: Replace cpumask_of_cpu with cpumask_of_cpu_ptr * This patch replaces the dangerous lvalue version of cpumask_of_cpu with new cpumask_of_cpu_ptr macros. 
These are patterned after the node_to_cpumask_ptr macros. In general terms, if there is a cpumask_of_cpu_map[] then a pointer to the cpumask_of_cpu_map[cpu] entry is used. The cpumask_of_cpu_map is provided when there is a large NR_CPUS count, reducing greatly the amount of code generated and stack space used for cpumask_of_cpu(). The pointer to the cpumask_t value is needed for calling set_cpus_allowed_ptr() to reduce the amount of stack space needed to pass the cpumask_t value. If there isn't a cpumask_of_cpu_map[], then a temporary variable is declared and filled in with value from cpumask_of_cpu(cpu) as well as a pointer variable pointing to this temporary variable. Afterwards, the pointer is used to reference the cpumask value. The compiler will optimize out the extra dereference through the pointer as well as the stack space used for the pointer, resulting in identical code. A good example of the orthogonal usages is in net/sunrpc/svc.c: case SVC_POOL_PERCPU: { unsigned int cpu = m->pool_to[pidx]; cpumask_of_cpu_ptr(cpumask, cpu); *oldmask = current->cpus_allowed; set_cpus_allowed_ptr(current, cpumask); return 1; } case SVC_POOL_PERNODE: { unsigned int node = m->pool_to[pidx]; node_to_cpumask_ptr(nodecpumask, node); *oldmask = current->cpus_allowed; set_cpus_allowed_ptr(current, nodecpumask); return 1; } Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 3 ++- kernel/trace/trace_sysprof.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ba9b205..738b411 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -33,8 +33,9 @@ static int stopmachine(void *cpu) { int irqs_disabled = 0; int prepared = 0; + cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); + set_cpus_allowed_ptr(current, cpumask); /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 2301e1e..6352808 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -213,7 +213,9 @@ static void start_stack_timers(void) int cpu; for_each_online_cpu(cpu) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + cpumask_of_cpu_ptr(new_mask, cpu); + + set_cpus_allowed_ptr(current, new_mask); start_stack_timer(cpu); } set_cpus_allowed_ptr(current, &saved_mask); -- cgit v1.1 From c18a41fbbc500ac0307ffd2b0ae73c2af9d0b0ab Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:34 -0700 Subject: cpumask: Optimize cpumask_of_cpu in kernel/time/tick-common.c * Optimize various places where a pointer to the cpumask_of_cpu value will result in reducing stack pressure. 
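As a rough sketch of the declare/next pattern used in the diff below, with assumed, simplified stand-in definitions (the real kernel macros differ and fall back to a stack temporary when no cpumask_of_cpu_map[] is available): only a pointer lives on the stack, and it is bound to a CPU once the cpu number is actually known:

```c
#include <stdio.h>

#define NR_CPUS		4096
typedef struct {
	unsigned long bits[NR_CPUS / (8 * sizeof(unsigned long))];
} cpumask_t;

/* one single-bit mask per cpu; filled in at boot in the real kernel */
static cpumask_t cpumask_of_cpu_map[NR_CPUS];

#define cpumask_of_cpu_ptr_declare(name)	const cpumask_t *name
#define cpumask_of_cpu_ptr_next(name, cpu)	((name) = &cpumask_of_cpu_map[cpu])

int main(void)
{
	cpumask_of_cpu_ptr_declare(cpumask);	/* no 512-byte object on the stack */
	int cpu = 3;				/* normally smp_processor_id() */

	cpumask_of_cpu_ptr_next(cpumask, cpu);	/* bind once the cpu is known */
	printf("cpu %d mask at %p\n", cpu, (void *)cpumask);
	return 0;
}
```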
Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/time/tick-common.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 4f38865..bf43284 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -135,7 +135,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, - cpumask_t cpumask) + const cpumask_t *cpumask) { ktime_t next_event; void (*handler)(struct clock_event_device *) = NULL; @@ -169,8 +169,8 @@ static void tick_setup_device(struct tick_device *td, * When the device is not per cpu, pin the interrupt to the * current cpu: */ - if (!cpus_equal(newdev->cpumask, cpumask)) - irq_set_affinity(newdev->irq, cpumask); + if (!cpus_equal(newdev->cpumask, *cpumask)) + irq_set_affinity(newdev->irq, *cpumask); /* * When global broadcasting is active, check if the current @@ -196,20 +196,20 @@ static int tick_check_new_device(struct clock_event_device *newdev) struct tick_device *td; int cpu, ret = NOTIFY_OK; unsigned long flags; - cpumask_t cpumask; + cpumask_of_cpu_ptr_declare(cpumask); spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); + cpumask_of_cpu_ptr_next(cpumask, cpu); if (!cpu_isset(cpu, newdev->cpumask)) goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; - cpumask = cpumask_of_cpu(cpu); /* cpu local device ? */ - if (!cpus_equal(newdev->cpumask, cpumask)) { + if (!cpus_equal(newdev->cpumask, *cpumask)) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, cpumask)) + if (curdev && cpus_equal(curdev->cpumask, *cpumask)) goto out_bc; } -- cgit v1.1 From 31656519e132f6612584815f128c83976a9aaaef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Jul 2008 18:01:23 +0200 Subject: sched, x86: clean up hrtick implementation random uvesafb failures were reported against Gentoo: http://bugs.gentoo.org/show_bug.cgi?id=222799 and Mihai Moldovan bisected it back to: > 8f4d37ec073c17e2d4aa8851df5837d798606d6f is first bad commit > commit 8f4d37ec073c17e2d4aa8851df5837d798606d6f > Author: Peter Zijlstra > Date: Fri Jan 25 21:08:29 2008 +0100 > > sched: high-res preemption tick Linus suspected it to be hrtick + vm86 interaction and observed: > Btw, Peter, Ingo: I think that commit is doing bad things. They aren't > _incorrect_ per se, but they are definitely bad. > > Why? > > Using random _TIF_WORK_MASK flags is really impolite for doing > "scheduling" work. There's a reason that arch/x86/kernel/entry_32.S > special-cases the _TIF_NEED_RESCHED flag: we don't want to exit out of > vm86 mode unnecessarily. > > See the "work_notifysig_v86" label, and how it does that > "save_v86_state()" thing etc etc. Right, I never liked having to fiddle with those TIF flags. Initially I needed it because the hrtimer base lock could not nest in the rq lock. That however is fixed these days. Currently the only reason left to fiddle with the TIF flags is remote wakeups. We cannot program a remote cpu's hrtimer. I've been thinking about using the new and improved IPI function call stuff to implement hrtimer_start_on(). 
However that does require that smp_call_function_single(.wait=0) works from interrupt context - /me looks at the latest series from Jens - Yes that does seem to be supported, good. Here's a stab at cleaning this stuff up ... Mihai reported test success as well. Signed-off-by: Peter Zijlstra Tested-by: Mihai Moldovan Cc: Michal Januszewski Cc: Antonino Daplas Signed-off-by: Ingo Molnar --- kernel/Kconfig.hz | 2 +- kernel/sched.c | 202 ++++++++++++++++------------------------------------ kernel/sched_fair.c | 5 +- 3 files changed, 63 insertions(+), 146 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 526128a..2a202a8 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && X86 + def_bool HIGH_RES_TIMERS diff --git a/kernel/sched.c b/kernel/sched.c index 1ee18db..c13c75e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -571,8 +571,10 @@ struct rq { #endif #ifdef CONFIG_SCHED_HRTICK - unsigned long hrtick_flags; - ktime_t hrtick_expire; +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif struct hrtimer hrtick_timer; #endif @@ -983,13 +985,6 @@ static struct rq *this_rq_lock(void) return rq; } -static void __resched_task(struct task_struct *p, int tif_bit); - -static inline void resched_task(struct task_struct *p) -{ - __resched_task(p, TIF_NEED_RESCHED); -} - #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -1001,25 +996,6 @@ static inline void resched_task(struct task_struct *p) * When we get rescheduled we reprogram the hrtick_timer outside of the * rq->lock. */ -static inline void resched_hrt(struct task_struct *p) -{ - __resched_task(p, TIF_HRTICK_RESCHED); -} - -static inline void resched_rq(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - resched_task(rq->curr); - spin_unlock_irqrestore(&rq->lock, flags); -} - -enum { - HRTICK_SET, /* re-programm hrtick_timer */ - HRTICK_RESET, /* not a new slice */ - HRTICK_BLOCK, /* stop hrtick operations */ -}; /* * Use hrtick when: @@ -1030,40 +1006,11 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; - if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) + if (!cpu_online(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and irqs disabled - */ -static void hrtick_start(struct rq *rq, u64 delay, int reset) -{ - assert_spin_locked(&rq->lock); - - /* - * preempt at: now + delay - */ - rq->hrtick_expire = - ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); - /* - * indicate we need to program the timer - */ - __set_bit(HRTICK_SET, &rq->hrtick_flags); - if (reset) - __set_bit(HRTICK_RESET, &rq->hrtick_flags); - - /* - * New slices are called from the schedule path and don't need a - * forced reschedule. - */ - if (reset) - resched_hrt(rq->curr); -} - static void hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) @@ -1071,32 +1018,6 @@ static void hrtick_clear(struct rq *rq) } /* - * Update the timer from the possible pending state. 
- */ -static void hrtick_set(struct rq *rq) -{ - ktime_t time; - int set, reset; - unsigned long flags; - - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - - spin_lock_irqsave(&rq->lock, flags); - set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); - reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); - time = rq->hrtick_expire; - clear_thread_flag(TIF_HRTICK_RESCHED); - spin_unlock_irqrestore(&rq->lock, flags); - - if (set) { - hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); - if (reset && !hrtimer_active(&rq->hrtick_timer)) - resched_rq(rq); - } else - hrtick_clear(rq); -} - -/* * High-resolution timer tick. * Runs from hardirq context with interrupts disabled. */ @@ -1115,27 +1036,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) } #ifdef CONFIG_SMP -static void hotplug_hrtick_disable(int cpu) +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - rq->hrtick_flags = 0; - __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); - spin_unlock_irqrestore(&rq->lock, flags); + struct rq *rq = arg; - hrtick_clear(rq); + spin_lock(&rq->lock); + hrtimer_restart(&rq->hrtick_timer); + rq->hrtick_csd_pending = 0; + spin_unlock(&rq->lock); } -static void hotplug_hrtick_enable(int cpu) +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - spin_lock_irqsave(&rq->lock, flags); - __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); - spin_unlock_irqrestore(&rq->lock, flags); + timer->expires = time; + + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + rq->hrtick_csd_pending = 1; + } } static int @@ -1150,16 +1081,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - hotplug_hrtick_disable(cpu); - return NOTIFY_OK; - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hotplug_hrtick_enable(cpu); + hrtick_clear(cpu_rq(cpu)); return NOTIFY_OK; } @@ -1170,46 +1092,45 @@ static void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } -#endif /* CONFIG_SMP */ +#else +/* + * Called to set the hrtick timer state. 
+ * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); +} -static void init_rq_hrtick(struct rq *rq) +static void init_hrtick(void) { - rq->hrtick_flags = 0; - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } +#endif /* CONFIG_SMP */ -void hrtick_resched(void) +static void init_rq_hrtick(struct rq *rq) { - struct rq *rq; - unsigned long flags; +#ifdef CONFIG_SMP + rq->hrtick_csd_pending = 0; - if (!test_thread_flag(TIF_HRTICK_RESCHED)) - return; + rq->hrtick_csd.flags = 0; + rq->hrtick_csd.func = __hrtick_start; + rq->hrtick_csd.info = rq; +#endif - local_irq_save(flags); - rq = cpu_rq(smp_processor_id()); - hrtick_set(rq); - local_irq_restore(flags); + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } #else static inline void hrtick_clear(struct rq *rq) { } -static inline void hrtick_set(struct rq *rq) -{ -} - static inline void init_rq_hrtick(struct rq *rq) { } -void hrtick_resched(void) -{ -} - static inline void init_hrtick(void) { } @@ -1228,16 +1149,16 @@ static inline void init_hrtick(void) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void __resched_task(struct task_struct *p, int tif_bit) +static void resched_task(struct task_struct *p) { int cpu; assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, tif_bit))) + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) return; - set_tsk_thread_flag(p, tif_bit); + set_tsk_thread_flag(p, TIF_NEED_RESCHED); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -1303,10 +1224,10 @@ void wake_up_idle_cpu(int cpu) #endif /* CONFIG_NO_HZ */ #else /* !CONFIG_SMP */ -static void __resched_task(struct task_struct *p, int tif_bit) +static void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); - set_tsk_thread_flag(p, tif_bit); + set_tsk_need_resched(p); } #endif /* CONFIG_SMP */ @@ -4395,7 +4316,7 @@ asmlinkage void __sched schedule(void) struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; - int cpu, hrtick = sched_feat(HRTICK); + int cpu; need_resched: preempt_disable(); @@ -4410,7 +4331,7 @@ need_resched_nonpreemptible: schedule_debug(prev); - if (hrtick) + if (sched_feat(HRTICK)) hrtick_clear(rq); /* @@ -4457,9 +4378,6 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); - if (hrtick) - hrtick_set(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2aa987..6893b3e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -878,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_SCHED_HRTICK static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { - int requeue = rq->curr == p; struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -899,10 +898,10 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) * Don't schedule slices shorter than 10000ns, that just * doesn't make sense. Rely on vruntime for fairness. 
*/ - if (!requeue) + if (rq->curr != p) delta = max(10000LL, delta); - hrtick_start(rq, delta, requeue); + hrtick_start(rq, delta); } } #else /* !CONFIG_SCHED_HRTICK */ -- cgit v1.1 From ba42059fbd0aa1ac91b582412b5fedb1258f241f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 20 Jul 2008 11:02:06 +0200 Subject: sched: hrtick_enabled() should use cpu_active() Peter pointed out that hrtick_enabled() should use cpu_active(). Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 85cf246..62b1b8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1006,7 +1006,7 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; - if (!cpu_online(cpu_of(rq))) + if (!cpu_active(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } -- cgit v1.1 From b6d4f7e3ef25beb8c658c97867d98883e69dc544 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 20 Jul 2008 15:01:10 +0400 Subject: dma-coherent: add documentation to new interfaces Signed-off-by: Dmitry Baryshkov Cc: Jesse Barnes Signed-off-by: Ingo Molnar --- kernel/dma-coherent.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 56dff5c..7517115 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -92,6 +92,21 @@ void *dma_mark_declared_memory_occupied(struct device *dev, } EXPORT_SYMBOL(dma_mark_declared_memory_occupied); +/** + * Try to allocate memory from the per-device coherent area. + * + * @dev: device from which we allocate memory + * @size: size of requested memory area + * @dma_handle: This will be filled with the correct dma handle + * @ret: This pointer will be filled with the virtual address + * to allocated area. + * + * This function should be only called from per-arch %dma_alloc_coherent() + * to support allocation from per-device coherent memory pools. + * + * Returns 0 if dma_alloc_coherent should continue with allocating from + * generic memory areas, or !0 if dma_alloc_coherent should return %ret. + */ int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) { @@ -111,6 +126,19 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, return (mem != NULL); } +/** + * Try to free the memory allocated from per-device coherent memory pool. + * @dev: device from which the memory was allocated + * @order: the order of pages allocated + * @vaddr: virtual address of allocated pages + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, releases that memory. + * + * Returns 1 if we correctly released the memory, or 0 if + * %dma_release_coherent() should proceed with releasing memory from + * generic pools. + */ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) { struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; -- cgit v1.1 From 4a0b2b4dbe1335b8b9886ba3dc85a145d5d938ed Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Jul 2008 18:48:41 +0200 Subject: sysdev: Pass the attribute to the low level sysdev show/store function This allow to dynamically generate attributes and share show/store functions between attributes. Right now most attributes are generated by special macros and lots of duplicated code. 
With the attribute passed it's instead possible to attach some data to the attribute and then use that in shared low level functions to do different things. I need this for the dynamically generated bank attributes in the x86 machine check code, but it'll allow some further cleanups. I converted all users in tree to the new show/store prototype. It's a single huge patch to avoid unbisectable sections. Runtime tested: x86-32, x86-64 Compiled only: ia64, powerpc Not compile tested/only grep converted: sh, arm, avr32 Signed-off-by: Andi Kleen Signed-off-by: Greg Kroah-Hartman --- kernel/rtmutex-tester.c | 7 ++++--- kernel/sched.c | 8 ++++++-- kernel/time/clocksource.c | 8 ++++++-- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 092e4c6..a56f629 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -297,8 +297,8 @@ static int test_func(void *data) * * opcode:data */ -static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, - size_t count) +static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, + const char *buf, size_t count) { struct sched_param schedpar; struct test_thread_data *td; @@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, * @dev: thread to query * @buf: char buffer to be filled with thread status info */ -static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) +static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) { struct test_thread_data *td; struct task_struct *tsk; diff --git a/kernel/sched.c b/kernel/sched.c index 99e6d85..b1104ea 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7737,11 +7737,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_mc_power_savings_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } static ssize_t sched_mc_power_savings_store(struct sys_device *dev, + struct sysdev_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); @@ -7751,11 +7753,13 @@ static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) +static ssize_t sched_smt_power_savings_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } static ssize_t sched_smt_power_savings_store(struct sys_device *dev, + struct sysdev_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dadde53..b1c2da8 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -376,7 +376,8 @@ void clocksource_unregister(struct clocksource *cs) * Provides sysfs interface for listing current clocksource. */ static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, char *buf) +sysfs_show_current_clocksources(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { ssize_t count = 0; @@ -397,6 +398,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf) * clocksource selction. 
*/ static ssize_t sysfs_override_clocksource(struct sys_device *dev, + struct sysdev_attribute *attr, const char *buf, size_t count) { struct clocksource *ovr = NULL; @@ -449,7 +451,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, * Provides sysfs interface for listing registered clocksources */ static ssize_t -sysfs_show_available_clocksources(struct sys_device *dev, char *buf) +sysfs_show_available_clocksources(struct sys_device *dev, + struct sysdev_attribute *attr, + char *buf) { struct clocksource *src; ssize_t count = 0; -- cgit v1.1 From da39ba5e1d65e997a98f6eb93ba6e6eb505f6e3c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 22 Jul 2008 19:24:25 -0500 Subject: module: don't use stop_machine for waiting rmmod rmmod has a little-used "-w" option, meaning that instead of failing if the module is in use, it should block until the module becomes unused. In this case, we don't need to use stop_machine: Max Krasnyansky indicated that would be useful for SystemTap which loads/unloads new modules frequently. Cc: Max Krasnyansky Signed-off-by: Rusty Russell --- kernel/module.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5f80478..705e1d5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -639,8 +639,8 @@ static int __try_stop_module(void *_sref) { struct stopref *sref = _sref; - /* If it's not unused, quit unless we are told to block. */ - if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { + /* If it's not unused, quit unless we're forcing. */ + if (module_refcount(sref->mod) != 0) { if (!(*sref->forced = try_force_unload(sref->flags))) return -EWOULDBLOCK; } @@ -652,9 +652,16 @@ static int __try_stop_module(void *_sref) static int try_stop_module(struct module *mod, int flags, int *forced) { - struct stopref sref = { mod, flags, forced }; + if (flags & O_NONBLOCK) { + struct stopref sref = { mod, flags, forced }; - return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + } else { + /* We don't need to stop the machine for this. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + return 0; + } } unsigned int module_refcount(struct module *mod) -- cgit v1.1 From dafd0940c96fec67974a88ed8e6b8ba3160394cd Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 22 Jul 2008 19:24:25 -0500 Subject: module: generic each_symbol iterator function Introduce an each_symbol() iterator to avoid duplicating the knowledge about the 5 different sections containing symbols. Currently only used by find_symbol(), but will be used by symbol_put_addr() too. (Includes NULL ptr deref fix by Jiri Kosina ) Signed-off-by: Rusty Russell Cc: Jiri Kosina --- kernel/module.c | 244 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 134 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 705e1d5..c51c089 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -152,156 +152,180 @@ extern const unsigned long __start___kcrctab_unused_gpl[]; #define symversion(base, idx) ((base != NULL) ? 
((base) + (idx)) : NULL) #endif -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; -} - -static bool always_ok(bool gplok, bool warn, const char *name) -{ - return true; -} - -static bool printk_unused_warning(bool gplok, bool warn, const char *name) -{ - if (warn) { - printk(KERN_WARNING "Symbol %s is marked as UNUSED, " - "however this module is using it.\n", name); - printk(KERN_WARNING - "This symbol will go away in the future.\n"); - printk(KERN_WARNING - "Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); - } - return true; -} - -static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name) -{ - if (!gplok) - return false; - return printk_unused_warning(gplok, warn, name); -} - -static bool gpl_only(bool gplok, bool warn, const char *name) -{ - return gplok; -} - -static bool warn_if_not_gpl(bool gplok, bool warn, const char *name) -{ - if (!gplok && warn) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); - } - return true; -} - struct symsearch { const struct kernel_symbol *start, *stop; const unsigned long *crcs; - bool (*check)(bool gplok, bool warn, const char *name); + enum { + NOT_GPL_ONLY, + GPL_ONLY, + WILL_BE_GPL_ONLY, + } licence; + bool unused; }; -/* Look through this array of symbol tables for a symbol match which - * passes the check function. */ -static const struct kernel_symbol *search_symarrays(const struct symsearch *arr, - unsigned int num, - const char *name, - bool gplok, - bool warn, - const unsigned long **crc) +static bool each_symbol_in_section(const struct symsearch *arr, + unsigned int arrsize, + struct module *owner, + bool (*fn)(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data), + void *data) { - unsigned int i; - const struct kernel_symbol *ks; - - for (i = 0; i < num; i++) { - ks = lookup_symbol(name, arr[i].start, arr[i].stop); - if (!ks || !arr[i].check(gplok, warn, name)) - continue; + unsigned int i, j; - if (crc) - *crc = symversion(arr[i].crcs, ks - arr[i].start); - return ks; + for (j = 0; j < arrsize; j++) { + for (i = 0; i < arr[j].stop - arr[j].start; i++) + if (fn(&arr[j], owner, i, data)) + return true; } - return NULL; + + return false; } -/* Find a symbol, return value, (optional) crc and (optional) module - * which owns it */ -static unsigned long find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) +/* Returns true as soon as fn returns true, otherwise false. 
*/ +static bool each_symbol(bool (*fn)(const struct symsearch *arr, + struct module *owner, + unsigned int symnum, void *data), + void *data) { struct module *mod; - const struct kernel_symbol *ks; const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, - always_ok }, + NOT_GPL_ONLY, false }, { __start___ksymtab_gpl, __stop___ksymtab_gpl, - __start___kcrctab_gpl, gpl_only }, + __start___kcrctab_gpl, + GPL_ONLY, false }, { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, - __start___kcrctab_gpl_future, warn_if_not_gpl }, + __start___kcrctab_gpl_future, + WILL_BE_GPL_ONLY, false }, { __start___ksymtab_unused, __stop___ksymtab_unused, - __start___kcrctab_unused, printk_unused_warning }, + __start___kcrctab_unused, + NOT_GPL_ONLY, true }, { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, - __start___kcrctab_unused_gpl, gpl_only_unused_warning }, + __start___kcrctab_unused_gpl, + GPL_ONLY, true }, }; - /* Core kernel first. */ - ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc); - if (ks) { - if (owner) - *owner = NULL; - return ks->value; - } + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) + return true; - /* Now try modules. */ list_for_each_entry(mod, &modules, list) { struct symsearch arr[] = { { mod->syms, mod->syms + mod->num_syms, mod->crcs, - always_ok }, + NOT_GPL_ONLY, false }, { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, - mod->gpl_crcs, gpl_only }, + mod->gpl_crcs, + GPL_ONLY, false }, { mod->gpl_future_syms, mod->gpl_future_syms + mod->num_gpl_future_syms, - mod->gpl_future_crcs, warn_if_not_gpl }, + mod->gpl_future_crcs, + WILL_BE_GPL_ONLY, false }, { mod->unused_syms, mod->unused_syms + mod->num_unused_syms, - mod->unused_crcs, printk_unused_warning }, + mod->unused_crcs, + NOT_GPL_ONLY, true }, { mod->unused_gpl_syms, mod->unused_gpl_syms + mod->num_unused_gpl_syms, - mod->unused_gpl_crcs, gpl_only_unused_warning }, + mod->unused_gpl_crcs, + GPL_ONLY, true }, }; - ks = search_symarrays(arr, ARRAY_SIZE(arr), - name, gplok, warn, crc); - if (ks) { - if (owner) - *owner = mod; - return ks->value; + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) + return true; + } + return false; +} + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const unsigned long *crc; + unsigned long value; +}; + +static bool find_symbol_in_section(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) +{ + struct find_symbol_arg *fsa = data; + + if (strcmp(syms->start[symnum].name, fsa->name) != 0) + return false; + + if (!fsa->gplok) { + if (syms->licence == GPL_ONLY) + return false; + if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { + printk(KERN_WARNING "Symbol %s is being used " + "by a non-GPL module, which will not " + "be allowed in the future\n", fsa->name); + printk(KERN_WARNING "Please see the file " + "Documentation/feature-removal-schedule.txt " + "in the kernel source tree for more details.\n"); } } + if (syms->unused && fsa->warn) { + printk(KERN_WARNING "Symbol %s is marked as UNUSED, " + "however this module is using it.\n", fsa->name); + printk(KERN_WARNING + "This symbol will go away in the future.\n"); + printk(KERN_WARNING + "Please evalute if this is the right api to use and if " + "it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); + } + + fsa->owner = owner; + fsa->crc = 
symversion(syms->crcs, symnum); + fsa->value = syms->start[symnum].value; + return true; +} + +/* Find a symbol, return value, (optional) crc and (optional) module + * which owns it */ +static unsigned long find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) +{ + struct find_symbol_arg fsa; + + fsa.name = name; + fsa.gplok = gplok; + fsa.warn = warn; + + if (each_symbol(find_symbol_in_section, &fsa)) { + if (owner) + *owner = fsa.owner; + if (crc) + *crc = fsa.crc; + return fsa.value; + } + DEBUGP("Failed to find symbol %s\n", name); return -ENOENT; } +/* lookup symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + const struct kernel_symbol *ks = start; + for (; ks < stop; ks++) + if (strcmp(ks->name, name) == 0) + return ks; + return NULL; +} + /* Search for module by name: must hold module_mutex. */ static struct module *find_module(const char *name) { -- cgit v1.1 From f7f5b67557eac1131ba6532522e3c50eced34238 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 22 Jul 2008 19:24:26 -0500 Subject: Shrink struct module: CONFIG_UNUSED_SYMBOLS ifdefs module.c and module.h conatains code for finding exported symbols which are declared with EXPORT_UNUSED_SYMBOL, and this code is compiled in even if CONFIG_UNUSED_SYMBOLS is not set and thus there can be no EXPORT_UNUSED_SYMBOLs in modules anyway (because EXPORT_UNUSED_SYMBOL(x) are compiled out to nothing then). This patch adds required #ifdefs. Signed-off-by: Denys Vlasenko Signed-off-by: Rusty Russell --- kernel/module.c | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c51c089..ea95805 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -134,17 +134,19 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const struct kernel_symbol __start___ksymtab_unused[]; -extern const struct kernel_symbol __stop___ksymtab_unused[]; -extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; extern const unsigned long __start___kcrctab_gpl_future[]; +#ifdef CONFIG_UNUSED_SYMBOLS +extern const struct kernel_symbol __start___ksymtab_unused[]; +extern const struct kernel_symbol __stop___ksymtab_unused[]; +extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; extern const unsigned long __start___kcrctab_unused[]; extern const unsigned long __start___kcrctab_unused_gpl[]; +#endif #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL @@ -198,12 +200,14 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, __start___kcrctab_gpl_future, WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS { __start___ksymtab_unused, __stop___ksymtab_unused, __start___kcrctab_unused, 
NOT_GPL_ONLY, true }, { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, __start___kcrctab_unused_gpl, GPL_ONLY, true }, +#endif }; if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) @@ -220,6 +224,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, mod->gpl_future_syms + mod->num_gpl_future_syms, mod->gpl_future_crcs, WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS { mod->unused_syms, mod->unused_syms + mod->num_unused_syms, mod->unused_crcs, @@ -228,6 +233,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, mod->unused_gpl_syms + mod->num_unused_gpl_syms, mod->unused_gpl_crcs, GPL_ONLY, true }, +#endif }; if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) @@ -270,6 +276,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, } } +#ifdef CONFIG_UNUSED_SYMBOLS if (syms->unused && fsa->warn) { printk(KERN_WARNING "Symbol %s is marked as UNUSED, " "however this module is using it.\n", fsa->name); @@ -281,6 +288,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, "mailinglist together with submitting your code for " "inclusion.\n"); } +#endif fsa->owner = owner; fsa->crc = symversion(syms->crcs, symnum); @@ -1476,8 +1484,10 @@ static int verify_export_symbols(struct module *mod) { mod->syms, mod->num_syms }, { mod->gpl_syms, mod->num_gpl_syms }, { mod->gpl_future_syms, mod->num_gpl_future_syms }, +#ifdef CONFIG_UNUSED_SYMBOLS { mod->unused_syms, mod->num_unused_syms }, { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, +#endif }; for (i = 0; i < ARRAY_SIZE(arr); i++) { @@ -1795,10 +1805,12 @@ static struct module *load_module(void __user *umod, unsigned int gplfutureindex; unsigned int gplfuturecrcindex; unsigned int unwindex = 0; +#ifdef CONFIG_UNUSED_SYMBOLS unsigned int unusedindex; unsigned int unusedcrcindex; unsigned int unusedgplindex; unsigned int unusedgplcrcindex; +#endif unsigned int markersindex; unsigned int markersstringsindex; struct module *mod; @@ -1881,13 +1893,15 @@ static struct module *load_module(void __user *umod, exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); - unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); - unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); +#ifdef CONFIG_UNUSED_SYMBOLS + unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); + unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); +#endif setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); @@ -2049,14 +2063,15 @@ static struct module *load_module(void __user *umod, mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / sizeof(*mod->gpl_future_syms); - mod->num_unused_syms = sechdrs[unusedindex].sh_size / - sizeof(*mod->unused_syms); - mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / - 
sizeof(*mod->unused_gpl_syms); mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; if (gplfuturecrcindex) mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; +#ifdef CONFIG_UNUSED_SYMBOLS + mod->num_unused_syms = sechdrs[unusedindex].sh_size / + sizeof(*mod->unused_syms); + mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / + sizeof(*mod->unused_gpl_syms); mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; if (unusedcrcindex) mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; @@ -2064,13 +2079,17 @@ static struct module *load_module(void __user *umod, if (unusedgplcrcindex) mod->unused_gpl_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; +#endif #ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !crcindex) || - (mod->num_gpl_syms && !gplcrcindex) || - (mod->num_gpl_future_syms && !gplfuturecrcindex) || - (mod->num_unused_syms && !unusedcrcindex) || - (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { + if ((mod->num_syms && !crcindex) + || (mod->num_gpl_syms && !gplcrcindex) + || (mod->num_gpl_future_syms && !gplfuturecrcindex) +#ifdef CONFIG_UNUSED_SYMBOLS + || (mod->num_unused_syms && !unusedcrcindex) + || (mod->num_unused_gpl_syms && !unusedgplcrcindex) +#endif + ) { printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); err = try_to_force_load(mod, "nocrc"); if (err) -- cgit v1.1 From 2f0f2a334bc38b61a9afca951185cd3844ee709d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 22 Jul 2008 19:24:27 -0500 Subject: module: turn longs into ints for module sizes This shrinks module.o and each *.ko file. And finally, structure members which hold length of module code (four such members there) and count of symbols are converted from longs to ints. We cannot possibly have a module where 32 bits won't be enough to hold such counts. For one, module loading checks module size for sanity before loading, so such insanely big module will fail that test first. Signed-off-by: Denys Vlasenko Signed-off-by: Rusty Russell --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ea95805..5c7eb069 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1567,7 +1567,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, } /* Update size with this section: return offset. */ -static long get_offset(unsigned long *size, Elf_Shdr *sechdr) +static long get_offset(unsigned int *size, Elf_Shdr *sechdr) { long ret; @@ -2562,7 +2562,7 @@ static int m_show(struct seq_file *m, void *p) struct module *mod = list_entry(p, struct module, list); char buf[8]; - seq_printf(m, "%s %lu", + seq_printf(m, "%s %u", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); -- cgit v1.1 From 3a642e99babe0617febb6f402e1e063479f489db Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 22 Jul 2008 19:24:28 -0500 Subject: modules: Take a shortcut for checking if an address is in a module This patch keeps track of the boundaries of module allocation, in order to speed up module_text_address(). Inspired by Arjan's version, which required arch-specific defines: Various pieces of the kernel (lockdep, latencytop, etc) tend to store backtraces, sometimes at a relatively high frequency. In itself this isn't a big performance deal (after all you're using diagnostics features), but there have been some complaints from people who have over 100 modules loaded that this is a tad too slow. 
This is due to the new backtracer code which looks at every slot on the stack to see if it's a kernel/module text address, so that's 1024 slots. 1024 times 100 modules... that's a lot of list walking. Signed-off-by: Rusty Russell --- kernel/module.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5c7eb069..d8b5605 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -70,6 +70,9 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); +/* Bounds of module allocation, for speeding __module_text_address */ +static unsigned long module_addr_min = -1UL, module_addr_max = 0; + int register_module_notifier(struct notifier_block * nb) { return blocking_notifier_chain_register(&module_notify_list, nb); @@ -1779,6 +1782,20 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ +static void *module_alloc_update_bounds(unsigned long size) +{ + void *ret = module_alloc(size); + + if (ret) { + /* Update module bounds. */ + if ((unsigned long)ret < module_addr_min) + module_addr_min = (unsigned long)ret; + if ((unsigned long)ret + size > module_addr_max) + module_addr_max = (unsigned long)ret + size; + } + return ret; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static struct module *load_module(void __user *umod, @@ -1980,7 +1997,7 @@ static struct module *load_module(void __user *umod, layout_sections(mod, hdr, sechdrs, secstrings); /* Do the allocs. */ - ptr = module_alloc(mod->core_size); + ptr = module_alloc_update_bounds(mod->core_size); if (!ptr) { err = -ENOMEM; goto free_percpu; @@ -1988,7 +2005,7 @@ static struct module *load_module(void __user *umod, memset(ptr, 0, mod->core_size); mod->module_core = ptr; - ptr = module_alloc(mod->init_size); + ptr = module_alloc_update_bounds(mod->init_size); if (!ptr && mod->init_size) { err = -ENOMEM; goto free_core; @@ -2645,6 +2662,9 @@ struct module *__module_text_address(unsigned long addr) { struct module *mod; + if (addr < module_addr_min || addr > module_addr_max) + return NULL; + list_for_each_entry(mod, &modules, list) if (within(addr, mod->module_init, mod->init_text_size) || within(addr, mod->module_core, mod->core_text_size)) -- cgit v1.1 From a1ef5adb4cad43460ebba23c5a78cf4a55bb6a5b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 8 Jul 2008 19:00:17 +0200 Subject: remove CONFIG_KMOD from core kernel code Always compile request_module when the kernel allows modules. Signed-off-by: Johannes Berg Signed-off-by: Rusty Russell --- kernel/exec_domain.c | 2 +- kernel/kmod.c | 2 +- kernel/sysctl.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index a9e6bad..c1ef192 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -65,7 +65,7 @@ lookup_exec_domain(u_long personality) goto out; } -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); request_module("personality-%ld", pers); read_lock(&exec_domains_lock); diff --git a/kernel/kmod.c b/kernel/kmod.c index 8df97d3..90d7af1 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -42,7 +42,7 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES /* modprobe_path is set via /proc/sys. 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6b16e16..b859e6b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -110,7 +110,7 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES extern char modprobe_path[]; #endif #ifdef CONFIG_CHR_DEV_SG @@ -475,7 +475,7 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif -#ifdef CONFIG_KMOD +#ifdef CONFIG_MODULES { .ctl_name = KERN_MODPROBE, .procname = "modprobe", -- cgit v1.1 From 91cd4d6ef0abb1f65e81f8fe37e7d3c10344e38c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 21 Jul 2008 14:21:35 -0700 Subject: cpusets: fix wrong domain attr updates Fix wrong domain attr updates, or we will always update the first sched domain attr. Signed-off-by: Miao Xie Cc: Hidetoshi Seto Cc: Paul Jackson Cc: Nick Piggin Cc: Ingo Molnar Cc: [2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 459d601..d2cc67d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -679,7 +679,9 @@ restart: if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; - update_domain_attr(dattr, b); + if (dattr) + update_domain_attr(dattr + + nslot, b); } } nslot++; -- cgit v1.1 From 5f17156fc55abac476d180e480bedb0f07f01b14 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Mon, 21 Jul 2008 14:21:37 -0700 Subject: Fix build on COMPAT platforms when CONFIG_EPOLL is disabled Add missing cond_syscall() entry for compat_sys_epoll_pwait. Signed-off-by: Atsushi Nemoto Cc: Davide Libenzi Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5b9b467..0fea0ee 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -59,6 +59,7 @@ cond_syscall(sys_epoll_create); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); cond_syscall(sys_epoll_pwait); +cond_syscall(compat_sys_epoll_pwait); cond_syscall(sys_semget); cond_syscall(sys_semop); cond_syscall(sys_semtimedop); -- cgit v1.1 From 422037bafde8083acc3c539ceba3dfc60a04110c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Jul 2008 11:16:38 +0200 Subject: sched: fix hrtick & generic-ipi dependency Andrew Morton reported this s390 allmodconfig build failure: kernel/built-in.o: In function `hrtick_start_fair': sched.c:(.text+0x69c6): undefined reference to `__smp_call_function_single' the reason is that s390 is not a generic-ipi SMP platform yet, while the hrtick code relies on it. Fix the dependency. 
Signed-off-by: Ingo Molnar --- kernel/Kconfig.hz | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a8..382dd5a 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS + def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS -- cgit v1.1 From 2db873211ba47ef704c301f9ecf4a33413a0b649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 23 Jul 2008 14:42:25 +0200 Subject: set_irq_wake: fix return code and wake status tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 15a647eba94c3da27ccc666bea72e7cca06b2d19 set_irq_wake returned -ENXIO if another device had it already enabled. Zero is the right value to return in this case. Moreover the change to desc->status was not reverted if desc->chip->set_wake returned an error. Signed-off-by: Uwe Kleine-König Acked-by: David Brownell Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Russell King Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 77a51be..3cfc0fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -217,6 +217,17 @@ void enable_irq(unsigned int irq) } EXPORT_SYMBOL(enable_irq); +int set_irq_wake_real(unsigned int irq, unsigned int on) +{ + struct irq_desc *desc = irq_desc + irq; + int ret = -ENXIO; + + if (desc->chip->set_wake) + ret = desc->chip->set_wake(irq, on); + + return ret; +} + /** * set_irq_wake - control irq power management wakeup * @irq: interrupt to control @@ -233,30 +244,34 @@ int set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_desc + irq; unsigned long flags; - int ret = -ENXIO; - int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake; + int ret = 0; /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ spin_lock_irqsave(&desc->lock, flags); if (on) { - if (desc->wake_depth++ == 0) - desc->status |= IRQ_WAKEUP; - else - set_wake = NULL; + if (desc->wake_depth++ == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 0; + else + desc->status |= IRQ_WAKEUP; + } } else { if (desc->wake_depth == 0) { printk(KERN_WARNING "Unbalanced IRQ %d " "wake disable\n", irq); WARN_ON(1); - } else if (--desc->wake_depth == 0) - desc->status &= ~IRQ_WAKEUP; - else - set_wake = NULL; + } else if (--desc->wake_depth == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 1; + else + desc->status &= ~IRQ_WAKEUP; + } } - if (set_wake) - ret = desc->chip->set_wake(irq, on); + spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.1 From 86a1c34a929f30fde8ad01ea8245df61ddcf58b7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 23 Jun 2008 15:37:04 -0700 Subject: x86_64 syscall audit fast-path This adds a fast path for 64-bit syscall entry and exit when TIF_SYSCALL_AUDIT is set, but no other kind of syscall tracing. This path does not need to save and restore all registers as the general case of tracing does. Avoiding the iret return path when syscall audit is enabled helps performance a lot. 
Signed-off-by: Roland McGrath --- kernel/auditsc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c10e7aa..4699950 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1476,7 +1476,8 @@ void audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - BUG_ON(!context); + if (unlikely(!context)) + return; /* * This happens only on certain architectures that make system -- cgit v1.1 From 1986b0cb1671ea39178b4e2b00461109728fc935 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 24 Jul 2008 08:10:02 +0200 Subject: ftrace: remove latency-tracer leftover remove the :vim=ft=help tag from trace files. I used them years ago to syntax-highlight traces and forgot about this hack. Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 868e121..fc20e09 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1203,9 +1203,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) iter->pos = *pos; - if (last_ent && !ent) - seq_puts(m, "\n\nvim:ft=help\n"); - return ent; } -- cgit v1.1 From 58838cf3ca3337d76141c33d6c68376490263468 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jul 2008 12:43:13 +0200 Subject: sched: clean up compiler warning Reported-by: Daniel Walker Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 147004c..93ac8ee 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -253,7 +253,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { - do_div(diff, weight); + diff = div_u64((u64)diff, weight); if (rt_rq->rt_runtime + diff > rt_period) diff = rt_period - rt_rq->rt_runtime; iter->rt_runtime -= diff; -- cgit v1.1 From c748e1340e0de3fa7fed86f8bdf499be9242afff Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:27:03 -0700 Subject: mm/vmstat.c: proper externs This patch adds proper extern declarations for five variables in include/linux/vmstat.h Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2a7b9d8..1f7b3b7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -80,7 +81,6 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; -extern int sysctl_stat_interval; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifdef CONFIG_RCU_TORTURE_TEST -- cgit v1.1 From a1e78772d72b2616ed20e54896e68e0e7044854e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:23 -0700 Subject: hugetlb: reserve huge pages for reliable MAP_PRIVATE hugetlbfs mappings until fork() This patch reserves huge pages at mmap() time for MAP_PRIVATE mappings in a similar manner to the reservations taken for MAP_SHARED mappings. The reserve count is accounted both globally and on a per-VMA basis for private mappings. 
This guarantees that a process that successfully calls mmap() will successfully fault all pages in the future unless fork() is called. The characteristics of private mappings of hugetlbfs files behaviour after this patch are; 1. The process calling mmap() is guaranteed to succeed all future faults until it forks(). 2. On fork(), the parent may die due to SIGKILL on writes to the private mapping if enough pages are not available for the COW. For reasonably reliable behaviour in the face of a small huge page pool, children of hugepage-aware processes should not reference the mappings; such as might occur when fork()ing to exec(). 3. On fork(), the child VMAs inherit no reserves. Reads on pages already faulted by the parent will succeed. Successful writes will depend on enough huge pages being free in the pool. 4. Quotas of the hugetlbfs mount are checked at reserve time for the mapper and at fault time otherwise. Before this patch, all reads or writes in the child potentially needs page allocations that can later lead to the death of the parent. This applies to reads and writes of uninstantiated pages as well as COW. After the patch it is only a write to an instantiated page that causes problems. Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index adefc11..552c8d8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -307,6 +308,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* * Link in the new vma and copy the page table entries. */ *pprev = tmp; -- cgit v1.1 From e5ff215941d59f8ae6bf58f6428dc5c26745a612 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:42 -0700 Subject: hugetlb: multiple hstates for multiple page sizes Add basic support for more than one hstate in hugetlbfs. This is the key to supporting multiple hugetlbfs page sizes at once. 
- Rather than a single hstate, we now have an array, with an iterator - default_hstate continues to be the struct hstate which we use by default - Add functions for architectures to register new hstates [akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1f7b3b7..1a8299d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -959,7 +959,7 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", - .data = &max_huge_pages, + .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_sysctl_handler, @@ -985,10 +985,12 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", - .data = &sysctl_overcommit_huge_pages, - .maxlen = sizeof(sysctl_overcommit_huge_pages), + .data = NULL, + .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_overcommit_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, }, #endif { -- cgit v1.1 From ab763c7112ce0e2559c73f921617c81dc7287ca6 Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Wed, 23 Jul 2008 21:28:25 -0700 Subject: security: filesystem capabilities refactor kernel code To date, we've tried hard to confine filesystem support for capabilities to the security modules. This has left a lot of the code in kernel/capability.c in a state where it looks like it supports something that filesystem support for capabilities actually suppresses when the LSM security/commmoncap.c code runs. What is left is a lot of code that uses sub-optimal locking in the main kernel With this change we refactor the main kernel code and make it explicit which locks are needed and that the only remaining kernel races in this area are associated with non-filesystem capability code. Signed-off-by: Andrew G. Morgan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 338 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 221 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 901e0fd..0101e84 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -115,11 +115,208 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) return 0; } +#ifndef CONFIG_SECURITY_FILE_CAPABILITIES + +/* + * Without filesystem capability support, we nominally support one process + * setting the capabilities of another + */ +static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, + kernel_cap_t *pIp, kernel_cap_t *pPp) +{ + struct task_struct *target; + int ret; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + if (pid && pid != task_pid_vnr(current)) { + target = find_task_by_vpid(pid); + if (!target) { + ret = -ESRCH; + goto out; + } + } else + target = current; + + ret = security_capget(target, pEp, pIp, pPp); + +out: + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + return ret; +} + +/* + * cap_set_pg - set capabilities for all processes in a given process + * group. We call this holding task_capability_lock and tasklist_lock. 
+ */ +static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct task_struct *g, *target; + int ret = -EPERM; + int found = 0; + struct pid *pgrp; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + pgrp = find_vpid(pgrp_nr); + do_each_pid_task(pgrp, PIDTYPE_PGID, g) { + target = g; + while_each_thread(g, target) { + if (!security_capset_check(target, effective, + inheritable, permitted)) { + security_capset_set(target, effective, + inheritable, permitted); + ret = 0; + } + found = 1; + } + } while_each_pid_task(pgrp, PIDTYPE_PGID, g); + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + if (!found) + ret = 0; + return ret; +} + /* - * For sys_getproccap() and sys_setproccap(), any of the three - * capability set pointers may be NULL -- indicating that that set is - * uninteresting and/or not to be changed. + * cap_set_all - set capabilities for all processes other than init + * and self. We call this holding task_capability_lock and tasklist_lock. */ +static inline int cap_set_all(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct task_struct *g, *target; + int ret = -EPERM; + int found = 0; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + do_each_thread(g, target) { + if (target == current + || is_container_init(target->group_leader)) + continue; + found = 1; + if (security_capset_check(target, effective, inheritable, + permitted)) + continue; + ret = 0; + security_capset_set(target, effective, inheritable, permitted); + } while_each_thread(g, target); + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + if (!found) + ret = 0; + + return ret; +} + +/* + * Given the target pid does not refer to the current process we + * need more elaborate support... (This support is not present when + * filesystem capabilities are configured.) + */ +static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct task_struct *target; + int ret; + + if (!capable(CAP_SETPCAP)) + return -EPERM; + + if (pid == -1) /* all procs other than current and init */ + return cap_set_all(effective, inheritable, permitted); + + else if (pid < 0) /* all procs in process group */ + return cap_set_pg(-pid, effective, inheritable, permitted); + + /* target != current */ + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + target = find_task_by_vpid(pid); + if (!target) + ret = -ESRCH; + else { + ret = security_capset_check(target, effective, inheritable, + permitted); + + /* having verified that the proposed changes are legal, + we now put them into effect. */ + if (!ret) + security_capset_set(target, effective, inheritable, + permitted); + } + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + return ret; +} + +#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */ + +/* + * If we have configured with filesystem capability support, then the + * only thing that can change the capabilities of the current process + * is the current process. As such, we can't be in this code at the + * same time as we are in the process of setting capabilities in this + * process. The net result is that we can limit our use of locks to + * when we are reading the caps of another process. 
+ */ +static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, + kernel_cap_t *pIp, kernel_cap_t *pPp) +{ + int ret; + + if (pid && (pid != task_pid_vnr(current))) { + struct task_struct *target; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + target = find_task_by_vpid(pid); + if (!target) + ret = -ESRCH; + else + ret = security_capget(target, pEp, pIp, pPp); + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + } else + ret = security_capget(current, pEp, pIp, pPp); + + return ret; +} + +/* + * With filesystem capability support configured, the kernel does not + * permit the changing of capabilities in one process by another + * process. (CAP_SETPCAP has much less broad semantics when configured + * this way.) + */ +static inline int do_sys_capset_other_tasks(pid_t pid, + kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + return -EPERM; +} + +#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */ /* * Atomically modify the effective capabilities returning the original @@ -155,7 +352,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { int ret = 0; pid_t pid; - struct task_struct *target; unsigned tocopy; kernel_cap_t pE, pI, pP; @@ -169,23 +365,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) if (pid < 0) return -EINVAL; - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = security_capget(target, &pE, &pI, &pP); - -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); + ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (!ret) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; @@ -216,7 +396,6 @@ out: * before modification is attempted and the application * fails. */ - if (copy_to_user(dataptr, kdata, tocopy * sizeof(struct __user_cap_data_struct))) { return -EFAULT; @@ -226,70 +405,8 @@ out: return ret; } -/* - * cap_set_pg - set capabilities for all processes in a given process - * group. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - struct pid *pgrp; - - pgrp = find_vpid(pgrp_nr); - do_each_pid_task(pgrp, PIDTYPE_PGID, g) { - target = g; - while_each_thread(g, target) { - if (!security_capset_check(target, effective, - inheritable, - permitted)) { - security_capset_set(target, effective, - inheritable, - permitted); - ret = 0; - } - found = 1; - } - } while_each_pid_task(pgrp, PIDTYPE_PGID, g); - - if (!found) - ret = 0; - return ret; -} - -/* - * cap_set_all - set capabilities for all processes other than init - * and self. We call this holding task_capability_lock and tasklist_lock. 
- */ -static inline int cap_set_all(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - - do_each_thread(g, target) { - if (target == current || is_container_init(target->group_leader)) - continue; - found = 1; - if (security_capset_check(target, effective, inheritable, - permitted)) - continue; - ret = 0; - security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); - - if (!found) - ret = 0; - return ret; -} - /** - * sys_capset - set capabilities for a process or a group of processes + * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, @@ -313,7 +430,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; - struct task_struct *target; int ret; pid_t pid; @@ -324,9 +440,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (get_user(pid, &header->pid)) return -EFAULT; - if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP)) - return -EPERM; - if (copy_from_user(&kdata, data, tocopy * sizeof(struct __user_cap_data_struct))) { return -EFAULT; @@ -344,40 +457,31 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) i++; } - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid > 0 && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = 0; - - /* having verified that the proposed changes are legal, - we now put them into effect. */ - if (pid < 0) { - if (pid == -1) /* all procs other than current and init */ - ret = cap_set_all(&effective, &inheritable, &permitted); + if (pid && (pid != task_pid_vnr(current))) + ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, + &permitted); + else { + /* + * This lock is required even when filesystem + * capability support is configured - it protects the + * sys_capget() call from returning incorrect data in + * the case that the targeted process is not the + * current one. + */ + spin_lock(&task_capability_lock); - else /* all procs in process group */ - ret = cap_set_pg(-pid, &effective, &inheritable, - &permitted); - } else { - ret = security_capset_check(target, &effective, &inheritable, + ret = security_capset_check(current, &effective, &inheritable, &permitted); + /* + * Having verified that the proposed changes are + * legal, we now put them into effect. + */ if (!ret) - security_capset_set(target, &effective, &inheritable, + security_capset_set(current, &effective, &inheritable, &permitted); + spin_unlock(&task_capability_lock); } -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); return ret; } -- cgit v1.1 From 0d63081d418c73cc187c893069e0f24c4c6eecd3 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 23 Jul 2008 21:28:32 -0700 Subject: swsusp: provide users with a hint about the no_console_suspend option Tell the user about the no_console_suspend option, so that we don't have to tell each bug reporter personally. [akpm@linux-foundation.org: clarify the text a little] Signed-off-by: Pavel Machek Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 07ad9e7..3f7a2a9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -933,7 +933,7 @@ void suspend_console(void) { if (!console_suspend_enabled) return; - printk("Suspending console(s)\n"); + printk("Suspending console(s) (use no_console_suspend to debug)\n"); acquire_console_sem(); console_suspended = 1; } -- cgit v1.1 From 77437fd4e61f87cc94d9314baa5cbf50e3ccdf54 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 23 Jul 2008 21:28:33 -0700 Subject: pm: boot time suspend selftest Boot-time test for system suspend states (STR or standby). The generic RTC framework triggers wakeup alarms, which are used to exit those states. - Measures some aspects of suspend time ... this uses "jiffies" until someone converts it to use a timebase that works properly even while timer IRQs are disabled. - Triggered by a command line parameter. By default nothing even vaguely troublesome will happen, but "test_suspend=mem" will give you a brief STR test during system boot. (Or you may need to use "test_suspend=standby" instead, if your hardware needs that.) This isn't without problems. It fires early enough during boot that for example both PCMCIA and MMC stacks have misbehaved. The workaround in those cases was to boot without such media cards inserted. [matthltc@us.ibm.com: fix compile failure in boot time suspend selftest] Signed-off-by: David Brownell Cc: Ingo Molnar Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 11 +++ kernel/power/main.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 204 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 59dfdf1..dcd165f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -94,6 +94,17 @@ config SUSPEND powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_LIB=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + config SUSPEND_FREEZER bool "Enable freezer for suspend to RAM/standby" \ if ARCH_WANTS_FREEZER_CONTROL || BROKEN diff --git a/kernel/power/main.c b/kernel/power/main.c index 3398f46..95bff23 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -132,6 +132,61 @@ static inline int suspend_test(int level) { return 0; } #ifdef CONFIG_SUSPEND +#ifdef CONFIG_PM_TEST_SUSPEND + +/* + * We test the system suspend code by setting an RTC wakealarm a short + * time in the future, then suspending. Suspending the devices won't + * normally take long ... some systems only need a few milliseconds. + * + * The time it takes is system-specific though, so when we test this + * during system bootup we allow a LOT of time. 
+ */ +#define TEST_SUSPEND_SECONDS 5 + +static unsigned long suspend_test_start_time; + +static void suspend_test_start(void) +{ + /* FIXME Use better timebase than "jiffies", ideally a clocksource. + * What we want is a hardware counter that will work correctly even + * during the irqs-are-off stages of the suspend/resume cycle... + */ + suspend_test_start_time = jiffies; +} + +static void suspend_test_finish(const char *label) +{ + long nj = jiffies - suspend_test_start_time; + unsigned msec; + + msec = jiffies_to_msecs(abs(nj)); + pr_info("PM: %s took %d.%03d seconds\n", label, + msec / 1000, msec % 1000); + + /* Warning on suspend means the RTC alarm period needs to be + * larger -- the system was sooo slooowwww to suspend that the + * alarm (should have) fired before the system went to sleep! + * + * Warning on either suspend or resume also means the system + * has some performance issues. The stack dump of a WARN_ON + * is more likely to get the right attention than a printk... + */ + WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); +} + +#else + +static void suspend_test_start(void) +{ +} + +static void suspend_test_finish(const char *label) +{ +} + +#endif + /* This is just an arbitrary number */ #define FREE_PAGE_NUMBER (100) @@ -266,12 +321,13 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); goto Recover_platform; } - + suspend_test_finish("suspend devices"); if (suspend_test(TEST_DEVICES)) goto Recover_platform; @@ -293,7 +349,9 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: + suspend_test_start(); device_resume(PMSG_RESUME); + suspend_test_finish("resume devices"); resume_console(); Close: if (suspend_ops->end) @@ -521,3 +579,137 @@ static int __init pm_init(void) } core_initcall(pm_init); + + +#ifdef CONFIG_PM_TEST_SUSPEND + +#include + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system. RTCs wake alarms are a common self-contained mechanism. 
+ */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ + static char err_readtime[] __initdata = + KERN_ERR "PM: can't read %s time, err %d\n"; + static char err_wakealarm [] __initdata = + KERN_ERR "PM: can't set %s wakealarm, err %d\n"; + static char err_suspend[] __initdata = + KERN_ERR "PM: suspend test failed, error %d\n"; + static char info_test[] __initdata = + KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + + unsigned long now; + struct rtc_wkalrm alm; + int status; + + /* this may fail if the RTC hasn't been initialized */ + status = rtc_read_time(rtc, &alm.time); + if (status < 0) { + printk(err_readtime, rtc->dev.bus_id, status); + return; + } + rtc_tm_to_time(&alm.time, &now); + + memset(&alm, 0, sizeof alm); + rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + alm.enabled = true; + + status = rtc_set_alarm(rtc, &alm); + if (status < 0) { + printk(err_wakealarm, rtc->dev.bus_id, status); + return; + } + + if (state == PM_SUSPEND_MEM) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + if (status == -ENODEV) + state = PM_SUSPEND_STANDBY; + } + if (state == PM_SUSPEND_STANDBY) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) + printk(err_suspend, status); +} + +static int __init has_wakealarm(struct device *dev, void *name_ptr) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + *(char **)name_ptr = dev->bus_id; + return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time. They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. + */ +static suspend_state_t test_state __initdata = PM_SUSPEND_ON; + +static char warn_bad_state[] __initdata = + KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ + unsigned i; + + /* "=mem" ==> "mem" */ + value++; + for (i = 0; i < PM_SUSPEND_MAX; i++) { + if (!pm_states[i]) + continue; + if (strcmp(pm_states[i], value) != 0) + continue; + test_state = (__force suspend_state_t) i; + return 0; + } + printk(warn_bad_state, value); + return 0; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ + static char warn_no_rtc[] __initdata = + KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + + char *pony = NULL; + struct rtc_device *rtc = NULL; + + /* PM is initialized by now; is that state testable? */ + if (test_state == PM_SUSPEND_ON) + goto done; + if (!valid_state(test_state)) { + printk(warn_bad_state, pm_states[test_state]); + goto done; + } + + /* RTCs have initialized by now too ... can we use one? */ + class_find_device(rtc_class, NULL, &pony, has_wakealarm); + if (pony) + rtc = rtc_class_open(pony); + if (!rtc) { + printk(warn_no_rtc); + goto done; + } + + /* go for it */ + test_wakealarm(rtc, test_state); + rtc_class_close(rtc); +done: + return 0; +} +late_initcall(test_suspend); + +#endif /* CONFIG_PM_TEST_SUSPEND */ -- cgit v1.1 From 0d83304c7e7bd3b05be90281b3a47841bc8f057a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Jul 2008 21:28:38 -0700 Subject: pm: hibernation: simplify memory bitmap This patch simplifies the memory bitmap manipulations. 
- remove the member size in struct bm_block It is not necessary for struct bm_block to have the number of bit chunks that can be calculated by using end_pfn and start_pfn. - use find_next_bit() for memory_bm_next_pfn No need to invent the bitmap library only for the memory bitmap. Signed-off-by: Akinobu Mita Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 88 ++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5f91a07..5d2ab83 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -205,8 +205,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * objects. The main list's elements are of type struct zone_bitmap * and each of them corresonds to one zone. For each zone bitmap * object there is a list of objects of type struct bm_block that - * represent each blocks of bit chunks in which information is - * stored. + * represent each blocks of bitmap in which information is stored. * * struct memory_bitmap contains a pointer to the main list of zone * bitmap objects, a struct bm_position used for browsing the bitmap, @@ -224,26 +223,27 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * pfns that correspond to the start and end of the represented zone. * * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bit chunks - * of type unsigned long each). It also contains the pfns that - * correspond to the start and end of the represented memory area and - * the number of bit chunks in the block. + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. 
*/ #define BM_END_OF_MAP (~0UL) -#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long)) -#define BM_BITS_PER_CHUNK (sizeof(long) << 3) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { struct bm_block *next; /* next element of the list */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned int size; /* number of bit chunks */ - unsigned long *data; /* chunks of bits representing pages */ + unsigned long *data; /* bitmap representing pages */ }; +static inline unsigned long bm_block_bits(struct bm_block *bb) +{ + return bb->end_pfn - bb->start_pfn; +} + struct zone_bitmap { struct zone_bitmap *next; /* next element of the list */ unsigned long start_pfn; /* minimal pfn in this zone */ @@ -257,7 +257,6 @@ struct zone_bitmap { struct bm_position { struct zone_bitmap *zone_bm; struct bm_block *block; - int chunk; int bit; }; @@ -272,12 +271,6 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ -static inline void memory_bm_reset_chunk(struct memory_bitmap *bm) -{ - bm->cur.chunk = 0; - bm->cur.bit = -1; -} - static void memory_bm_position_reset(struct memory_bitmap *bm) { struct zone_bitmap *zone_bm; @@ -285,7 +278,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm) zone_bm = bm->zone_bm_list; bm->cur.zone_bm = zone_bm; bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); @@ -394,12 +387,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) bb->start_pfn = pfn; if (nr >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - bb->size = BM_CHUNKS_PER_BLOCK; nr -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ pfn += nr; - bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK); } bb->end_pfn = pfn; bb = bb->next; @@ -478,8 +469,8 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, } zone_bm->cur_block = bb; pfn -= bb->start_pfn; - *bit_nr = pfn % BM_BITS_PER_CHUNK; - *addr = bb->data + pfn / BM_BITS_PER_CHUNK; + *bit_nr = pfn; + *addr = bb->data; return 0; } @@ -528,36 +519,6 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } -/* Two auxiliary functions for memory_bm_next_pfn */ - -/* Find the first set bit in the given chunk, if there is one */ - -static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p) -{ - bit++; - while (bit < BM_BITS_PER_CHUNK) { - if (test_bit(bit, chunk_p)) - return bit; - - bit++; - } - return -1; -} - -/* Find a chunk containing some bits set in given block of bits */ - -static inline int next_chunk_in_block(int n, struct bm_block *bb) -{ - n++; - while (n < bb->size) { - if (bb->data[n]) - return n; - - n++; - } - return -1; -} - /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. 
If the pfn cannot be found, BM_END_OF_MAP is @@ -571,40 +532,33 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { struct zone_bitmap *zone_bm; struct bm_block *bb; - int chunk; int bit; do { bb = bm->cur.block; do { - chunk = bm->cur.chunk; bit = bm->cur.bit; - do { - bit = next_bit_in_chunk(bit, bb->data + chunk); - if (bit >= 0) - goto Return_pfn; - - chunk = next_chunk_in_block(chunk, bb); - bit = -1; - } while (chunk >= 0); + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + bb = bb->next; bm->cur.block = bb; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } while (bb); zone_bm = bm->cur.zone_bm->next; if (zone_bm) { bm->cur.zone_bm = zone_bm; bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } } while (zone_bm); memory_bm_position_reset(bm); return BM_END_OF_MAP; Return_pfn: - bm->cur.chunk = chunk; - bm->cur.bit = bit; - return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; + bm->cur.bit = bit + 1; + return bb->start_pfn + bit; } /** -- cgit v1.1 From c1a220e7acf8ad2c03504891f4a70cd9c32c904b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 23 Jul 2008 21:28:39 -0700 Subject: pm: introduce new interfaces schedule_work_on() and queue_work_on() This interface allows adding a job on a specific cpu. Although a work struct on a cpu will be scheduled to other cpu if the cpu dies, there is a recursion if a work task tries to offline the cpu it's running on. we need to schedule the task to a specific cpu in this case. http://bugzilla.kernel.org/show_bug.cgi?id=10897 [oleg@tv-sign.ru: cleanups] Signed-off-by: Zhang Rui Tested-by: Rus Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a6d3634..6fd158b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -140,7 +140,6 @@ static void insert_work(struct cpu_workqueue_struct *cwq, wake_up(&cwq->more_work); } -/* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) { @@ -175,6 +174,31 @@ int queue_work(struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work); +/** + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. 
+ */ +int +queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +{ + int ret = 0; + + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + BUG_ON(!list_empty(&work->entry)); + __queue_work(wq_per_cpu(wq, cpu), work); + ret = 1; + } + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_on); + static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; @@ -553,6 +577,19 @@ int schedule_work(struct work_struct *work) } EXPORT_SYMBOL(schedule_work); +/* + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +int schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, keventd_wq, work); +} +EXPORT_SYMBOL(schedule_work_on); + /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done -- cgit v1.1 From 2f15fc4bdf91eb399da3f47a09c55831d9f22826 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 23 Jul 2008 21:28:40 -0700 Subject: pm: schedule sysrq poweroff on boot cpu schedule sysrq poweroff on boot cpu. sysrq poweroff needs to disable nonboot cpus, and we need to run this on boot cpu to avoid any recursion. http://bugzilla.kernel.org/show_bug.cgi?id=10897 [kosaki.motohiro@jp.fujitsu.com: build fix] Signed-off-by: Zhang Rui Tested-by: Rus Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/poweroff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 678ec73..72016f0 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -10,6 +10,7 @@ #include #include #include +#include /* * When the user hits Sys-Rq o to power down the machine this is the @@ -25,7 +26,8 @@ static DECLARE_WORK(poweroff_work, do_poweroff); static void handle_poweroff(int key, struct tty_struct *tty) { - schedule_work(&poweroff_work); + /* run sysrq poweroff on boot cpu */ + schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { -- cgit v1.1 From f0af566da6e9a4a2f5a83c5a70f3d0a772050e21 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 23 Jul 2008 21:28:44 -0700 Subject: pm: fix try_to_freeze_tasks()'s use of do_div() Fix try_to_freeze_tasks()'s use of do_div() on an s64 by making elapsed_csecs64 a u64 instead and dividing that. Possibly this should be guarded lest the interval calculation turn up negative, but the possible negativity of the result of the division is cast away anyway. This was introduced by patch 438e2ce68dfd4af4cfcec2f873564fb921db4bb5. Signed-off-by: David Howells Acked-by: "Rafael J. 
Wysocki" Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 5fb8765..278946a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -149,7 +149,7 @@ static int try_to_freeze_tasks(bool sig_only) unsigned long end_time; unsigned int todo; struct timeval start, end; - s64 elapsed_csecs64; + u64 elapsed_csecs64; unsigned int elapsed_csecs; do_gettimeofday(&start); -- cgit v1.1 From 82736f4d1d2b7063b829cc93171a6e5aea8a9c49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 23 Jul 2008 21:28:54 -0700 Subject: generic irqs: handle failure of irqchip->set_type in setup_irq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit set_type returns an int indicating success or failure, but up to now setup_irq ignores that. In my case this resulted in a machine hang: gpio-keys requested IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, but arm/ns9xxx can only trigger on one direction so set_type didn't touch the configuration which happens do default to a level sensitiveness and returned -EINVAL. setup_irq ignored that and unmasked the irq. This resulted in an endless triggering of the gpio-key interrupt service routine which effectively killed the machine. With this patch applied setup_irq propagates the error to the caller. Note that before in the case chip && !chip->set_type && !chip->name a NULL pointer was feed to printk. This is fixed, too. Signed-off-by: Uwe Kleine-König Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 64 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3cfc0fe..5bc6e5e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -308,6 +308,30 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) desc->handle_irq = NULL; } +static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, + unsigned long flags) +{ + int ret; + + if (!chip || !chip->set_type) { + /* + * IRQF_TRIGGER_* but the PIC does not support multiple + * flow-types? + */ + pr_warning("No set_type function for IRQ %d (%s)\n", irq, + chip ? (chip->name ? : "unknown") : "unknown"); + return 0; + } + + ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); + + if (ret) + pr_err("setting flow type for irq %u failed (%pF)\n", + irq, chip->set_type); + + return ret; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. 
@@ -319,6 +343,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) const char *old_name = NULL; unsigned long flags; int shared = 0; + int ret; if (irq >= NR_IRQS) return -EINVAL; @@ -376,35 +401,23 @@ int setup_irq(unsigned int irq, struct irqaction *new) shared = 1; } - *p = new; - - /* Exclude IRQ from balancing */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; - if (!shared) { irq_chip_set_defaults(desc->chip); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif - /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - if (desc->chip->set_type) - desc->chip->set_type(irq, - new->flags & IRQF_TRIGGER_MASK); - else - /* - * IRQF_TRIGGER_* but the PIC does not support - * multiple flow-types? - */ - printk(KERN_WARNING "No IRQF_TRIGGER set_type " - "function for IRQ %d (%s)\n", irq, - desc->chip->name); + ret = __irq_set_trigger(desc->chip, irq, new->flags); + + if (ret) { + spin_unlock_irqrestore(&desc->lock, flags); + return ret; + } } else compat_irq_chip_set_default_handler(desc); +#if defined(CONFIG_IRQ_PER_CPU) + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; +#endif desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); @@ -423,6 +436,13 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Set default affinity mask once everything is setup */ irq_select_affinity(irq); } + + *p = new; + + /* Exclude IRQ from balancing */ + if (new->flags & IRQF_NOBALANCING) + desc->status |= IRQ_NO_BALANCING; + /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; -- cgit v1.1 From aaca0bdca573f3f51ea03139f9c7289541e7bca3 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:20 -0700 Subject: flag parameters: paccept This patch is by far the most complex in the series. It adds a new syscall paccept. This syscall differs from accept in that it adds (at the userlevel) two additional parameters: - a signal mask - a flags value The flags parameter can be used to set flag like SOCK_CLOEXEC. This is imlpemented here as well. Some people argued that this is a property which should be inherited from the file desriptor for the server but this is against POSIX. Additionally, we really want the signal mask parameter as well (similar to pselect, ppoll, etc). So an interface change in inevitable. The flag value is the same as for socket and socketpair. I think diverging here will only create confusion. Similar to the filesystem interfaces where the use of the O_* constants differs, it is acceptable here. The signal mask is handled as for pselect etc. The mask is temporarily installed for the thread and removed before the call returns. I modeled the code after pselect. If there is a problem it's likely also in pselect. For architectures which use socketcall I maintained this interface instead of adding a system call. The symmetry shouldn't be broken. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #include #include #include #include #ifndef __NR_paccept # ifdef __x86_64__ # define __NR_paccept 288 # elif defined __i386__ # define SYS_PACCEPT 18 # define USE_SOCKETCALL 1 # else # error "need __NR_paccept" # endif #endif #ifdef USE_SOCKETCALL # define paccept(fd, addr, addrlen, mask, flags) \ ({ long args[6] = { \ (long) fd, (long) addr, (long) addrlen, (long) mask, 8, (long) flags }; \ syscall (__NR_socketcall, SYS_PACCEPT, args); }) #else # define paccept(fd, addr, addrlen, mask, flags) \ syscall (__NR_paccept, fd, addr, addrlen, mask, 8, flags) #endif #define PORT 57392 #define SOCK_CLOEXEC O_CLOEXEC static pthread_barrier_t b; static void * tf (void *arg) { pthread_barrier_wait (&b); int s = socket (AF_INET, SOCK_STREAM, 0); struct sockaddr_in sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK); sin.sin_port = htons (PORT); connect (s, (const struct sockaddr *) &sin, sizeof (sin)); close (s); pthread_barrier_wait (&b); s = socket (AF_INET, SOCK_STREAM, 0); sin.sin_port = htons (PORT); connect (s, (const struct sockaddr *) &sin, sizeof (sin)); close (s); pthread_barrier_wait (&b); pthread_barrier_wait (&b); sleep (2); pthread_kill ((pthread_t) arg, SIGUSR1); return NULL; } static void handler (int s) { } int main (void) { pthread_barrier_init (&b, NULL, 2); struct sockaddr_in sin; pthread_t th; if (pthread_create (&th, NULL, tf, (void *) pthread_self ()) != 0) { puts ("pthread_create failed"); return 1; } int s = socket (AF_INET, SOCK_STREAM, 0); int reuse = 1; setsockopt (s, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof (reuse)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK); sin.sin_port = htons (PORT); bind (s, (struct sockaddr *) &sin, sizeof (sin)); listen (s, SOMAXCONN); pthread_barrier_wait (&b); int s2 = paccept (s, NULL, 0, NULL, 0); if (s2 < 0) { puts ("paccept(0) failed"); return 1; } int coe = fcntl (s2, F_GETFD); if (coe & FD_CLOEXEC) { puts ("paccept(0) set close-on-exec-flag"); return 1; } close (s2); pthread_barrier_wait (&b); s2 = paccept (s, NULL, 0, NULL, SOCK_CLOEXEC); if (s2 < 0) { puts ("paccept(SOCK_CLOEXEC) failed"); return 1; } coe = fcntl (s2, F_GETFD); if ((coe & FD_CLOEXEC) == 0) { puts ("paccept(SOCK_CLOEXEC) does not set close-on-exec flag"); return 1; } close (s2); pthread_barrier_wait (&b); struct sigaction sa; sa.sa_handler = handler; sa.sa_flags = 0; sigemptyset (&sa.sa_mask); sigaction (SIGUSR1, &sa, NULL); sigset_t ss; pthread_sigmask (SIG_SETMASK, NULL, &ss); sigaddset (&ss, SIGUSR1); pthread_sigmask (SIG_SETMASK, &ss, NULL); sigdelset (&ss, SIGUSR1); alarm (4); pthread_barrier_wait (&b); errno = 0 ; s2 = paccept (s, NULL, 0, &ss, 0); if (s2 != -1 || errno != EINTR) { puts ("paccept did not fail with EINTR"); return 1; } close (s); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: make it compile] [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Cc: "David S. 
Miller" Cc: Roland McGrath Cc: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0fea0ee..2f0b8a2 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -31,6 +31,7 @@ cond_syscall(sys_socketpair); cond_syscall(sys_bind); cond_syscall(sys_listen); cond_syscall(sys_accept); +cond_syscall(sys_paccept); cond_syscall(sys_connect); cond_syscall(sys_getsockname); cond_syscall(sys_getpeername); -- cgit v1.1 From 9deb27baedb79759c3ab9435a7d8b841842d56e9 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:24 -0700 Subject: flag parameters: signalfd This patch adds the new signalfd4 syscall. It extends the old signalfd syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is SFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name SFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_signalfd4 # ifdef __x86_64__ # define __NR_signalfd4 289 # elif defined __i386__ # define __NR_signalfd4 327 # else # error "need __NR_signalfd4" # endif #endif #define SFD_CLOEXEC O_CLOEXEC int main (void) { sigset_t ss; sigemptyset (&ss); sigaddset (&ss, SIGUSR1); int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0); if (fd == -1) { puts ("signalfd4(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("signalfd4(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_CLOEXEC); if (fd == -1) { puts ("signalfd4(SFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("signalfd4(SFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2f0b8a2..8627c89 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -156,6 +156,7 @@ cond_syscall(sys_ioprio_get); /* New file descriptors */ cond_syscall(sys_signalfd); +cond_syscall(sys_signalfd4); cond_syscall(compat_sys_signalfd); cond_syscall(sys_timerfd_create); cond_syscall(sys_timerfd_settime); -- cgit v1.1 From b087498eb5605673b0f260a7620d91818cd72304 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:25 -0700 Subject: flag parameters: eventfd This patch adds the new eventfd2 syscall. It extends the old eventfd syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is EFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name EFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. 
The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_eventfd2 # ifdef __x86_64__ # define __NR_eventfd2 290 # elif defined __i386__ # define __NR_eventfd2 328 # else # error "need __NR_eventfd2" # endif #endif #define EFD_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_eventfd2, 1, 0); if (fd == -1) { puts ("eventfd2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("eventfd2(0) sets close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_eventfd2, 1, EFD_CLOEXEC); if (fd == -1) { puts ("eventfd2(EFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("eventfd2(EFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 8627c89..2a361cc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -164,3 +164,4 @@ cond_syscall(sys_timerfd_gettime); cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); +cond_syscall(sys_eventfd2); -- cgit v1.1 From 4006553b06306b34054529477b06b68a1c66249b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:32 -0700 Subject: flag parameters: inotify_init This patch introduces the new syscall inotify_init1 (note: the 1 stands for the one parameter the syscall takes, as opposed to no parameter before). The values accepted for this parameter are function-specific and defined in the inotify.h header. Here the values must match the O_* flags, though. In this patch CLOEXEC support is introduced. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_inotify_init1 # ifdef __x86_64__ # define __NR_inotify_init1 294 # elif defined __i386__ # define __NR_inotify_init1 332 # else # error "need __NR_inotify_init1" # endif #endif #define IN_CLOEXEC O_CLOEXEC int main (void) { int fd; fd = syscall (__NR_inotify_init1, 0); if (fd == -1) { puts ("inotify_init1(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("inotify_init1(0) set close-on-exit"); return 1; } close (fd); fd = syscall (__NR_inotify_init1, IN_CLOEXEC); if (fd == -1) { puts ("inotify_init1(IN_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("inotify_init1(O_CLOEXEC) does not set close-on-exit"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2a361cc..bd66ac5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -96,6 +96,7 @@ cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); cond_syscall(sys_inotify_init); +cond_syscall(sys_inotify_init1); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); -- cgit v1.1 From be61a86d7237dd80510615f38ae21d6e1e98660c Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:40 -0700 Subject: flag parameters: NONBLOCK in pipe This patch adds O_NONBLOCK support to pipe2. It is minimally more involved than the patches for eventfd et.al but still trivial. The interfaces of the create_write_pipe and create_read_pipe helper functions were changed and the one other caller as well. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_pipe2 # ifdef __x86_64__ # define __NR_pipe2 293 # elif defined __i386__ # define __NR_pipe2 331 # else # error "need __NR_pipe2" # endif #endif int main (void) { int fds[2]; if (syscall (__NR_pipe2, fds, 0) == -1) { puts ("pipe2(0) failed"); return 1; } for (int i = 0; i < 2; ++i) { int fl = fcntl (fds[i], F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if (fl & O_NONBLOCK) { printf ("pipe2(0) set non-blocking mode for fds[%d]\n", i); return 1; } close (fds[i]); } if (syscall (__NR_pipe2, fds, O_NONBLOCK) == -1) { puts ("pipe2(O_NONBLOCK) failed"); return 1; } for (int i = 0; i < 2; ++i) { int fl = fcntl (fds[i], F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if ((fl & O_NONBLOCK) == 0) { printf ("pipe2(O_NONBLOCK) does not set non-blocking mode for fds[%d]\n", i); return 1; } close (fds[i]); } puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 90d7af1..2989f67 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -417,12 +417,12 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, { struct file *f; - f = create_write_pipe(); + f = create_write_pipe(0); if (IS_ERR(f)) return PTR_ERR(f); *filp = f; - f = create_read_pipe(f); + f = create_read_pipe(f, 0); if (IS_ERR(f)) { free_write_pipe(*filp); return PTR_ERR(f); -- cgit v1.1 From 5df439ef06d4173357711a04740aa8bfcf50d621 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Fri, 25 Jul 2008 01:45:23 -0700 Subject: flag parameters: fix compile error of sys_epoll_create1 GEN .version CHK include/linux/compile.h UPD include/linux/compile.h CC init/version.o LD init/built-in.o LD vmlinux arch/x86/kernel/built-in.o: In function `sys_call_table': (.rodata+0x8a4): undefined reference to `sys_epoll_create1' make: *** [vmlinux] Error 1 Signed-off-by: Wang Chen Cc: Ulrich Drepper Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bd66ac5..55eca15 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -57,6 +57,7 @@ cond_syscall(compat_sys_set_robust_list); cond_syscall(sys_get_robust_list); cond_syscall(compat_sys_get_robust_list); cond_syscall(sys_epoll_create); +cond_syscall(sys_epoll_create1); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); cond_syscall(sys_epoll_pwait); -- cgit v1.1 From 2fc9c4e18f94431e7eb77d97edb2a995b46fba55 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 25 Jul 2008 01:45:34 -0700 Subject: kallsyms: fix potential overflow in binary search This will probably never trigger... but it won't hurt to be careful. 
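The remedy is the textbook one: compute the midpoint as low + (high - low) / 2 so the intermediate sum cannot wrap. A small self-contained illustration of the difference (not part of the patch):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <stdio.h>
#include <limits.h>

/* Overflow-safe midpoint: low + (high - low) / 2 never exceeds high,
   while (low + high) / 2 can wrap when the sum does not fit. */
static unsigned long mid_safe (unsigned long low, unsigned long high)
{
	return low + (high - low) / 2;
}

int main (void)
{
	unsigned long low = ULONG_MAX - 10, high = ULONG_MAX - 2;

	printf ("naive: %lu\n", (low + high) / 2);	/* sum wrapped; roughly ULONG_MAX/2 */
	printf ("safe : %lu\n", mid_safe (low, high));	/* ULONG_MAX - 6, as expected */
	return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~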
http://googleresearch.blogspot.com/2006/06/extra-extra-read-all-about-it-nearly.html Signed-off-by: Vegard Nossum Cc: Joshua Bloch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6fc0040..38fc10a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr, high = kallsyms_num_syms; while (high - low > 1) { - mid = (low + high) / 2; + mid = low + (high - low) / 2; if (kallsyms_addresses[mid] <= addr) low = mid; else -- cgit v1.1 From b03f6489f9f27dc519a4c60ebf39cc7b8a58eae7 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 01:45:35 -0700 Subject: build kernel/profile.o only when requested Build kernel/profile.o only if CONFIG_PROFILING is enabled. This makes CONFIG_PROFILING=n kernels smaller. As a bonus, some profile_tick() calls and one branch from schedule() are now eliminated with CONFIG_PROFILING=n (but I doubt these are measurable effects). This patch changes the effects of CONFIG_PROFILING=n, but I don't think having more than two choices would be the better choice. This patch also adds the name of the first parameter to the prototypes of profile_{hits,tick}() since I anyway had to add them for the dummy functions. Signed-off-by: Adrian Bunk Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 3 ++- kernel/profile.c | 4 ---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 15ab63f..54f6983 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ +obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ @@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/profile.c b/kernel/profile.c index 5892641..cd26bed 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -112,8 +112,6 @@ void __init profile_init(void) /* Profile event notifications */ -#ifdef CONFIG_PROFILING - static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); static ATOMIC_NOTIFIER_HEAD(task_free_notifier); static BLOCKING_NOTIFIER_HEAD(munmap_notifier); @@ -203,8 +201,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *)) } EXPORT_SYMBOL_GPL(unregister_timer_hook); -#endif /* CONFIG_PROFILING */ - #ifdef CONFIG_SMP /* -- cgit v1.1 From ac331d158e198d2a91a5b0a3ec4ca9991fdb57af Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 25 Jul 2008 01:45:38 -0700 Subject: call_usermodehelper(): increase reliability Presently call_usermodehelper_setup() uses GFP_ATOMIC. but it can return NULL _very_ easily. GFP_ATOMIC is needed only when we can't sleep. and, GFP_KERNEL is robust and better. thus, I add gfp_mask argument to call_usermodehelper_setup(). So, its callers pass the gfp_t as below: call_usermodehelper() and call_usermodehelper_keys(): depend on 'wait' argument. call_usermodehelper_pipe(): always GFP_KERNEL because always run under process context. 
orderly_poweroff(): pass to GFP_ATOMIC because may run under interrupt context. Signed-off-by: KOSAKI Motohiro Cc: "Paul Menage" Reviewed-by: Li Zefan Acked-by: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 9 +++++---- kernel/sys.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 2989f67..2456d1a 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -352,16 +352,17 @@ static inline void register_pm_notifier_callback(void) {} * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process + * @gfp_mask: gfp mask for memory allocation * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. */ -struct subprocess_info *call_usermodehelper_setup(char *path, - char **argv, char **envp) +struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, + char **envp, gfp_t gfp_mask) { struct subprocess_info *sub_info; - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); + sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); if (!sub_info) goto out; @@ -494,7 +495,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, struct subprocess_info *sub_info; int ret; - sub_info = call_usermodehelper_setup(path, argv, envp); + sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL); if (sub_info == NULL) return -ENOMEM; diff --git a/kernel/sys.c b/kernel/sys.c index 14e9728..6c21880 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1795,7 +1795,7 @@ int orderly_poweroff(bool force) goto out; } - info = call_usermodehelper_setup(argv[0], argv, envp); + info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); if (info == NULL) { argv_free(argv); goto out; -- cgit v1.1 From b69c49b78457f681ecfb3147bd968434ee6559c1 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 25 Jul 2008 01:45:40 -0700 Subject: clean up duplicated alloc/free_thread_info We duplicate alloc/free_thread_info defines on many platforms (the majority uses __get_free_pages/free_pages). This patch defines common defines and removes these duplicated defines. __HAVE_ARCH_THREAD_INFO_ALLOCATOR is introduced for platforms that do something different. 
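For a platform that really does "something different", the opt-out is expected to look roughly like this in its asm/thread_info.h; everything below is illustrative (the slab cache is invented), not code from the patch:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR

static struct kmem_cache *ti_cachep;	/* hypothetical cache, set up at arch init */

static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
{
	return kmem_cache_alloc(ti_cachep, GFP_KERNEL);
}

static inline void free_thread_info(struct thread_info *ti)
{
	kmem_cache_free(ti_cachep, ti);
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~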
Signed-off-by: FUJITA Tomonori Acked-by: Russell King Cc: Pekka Enberg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 552c8d8..5a5d6fe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,6 +93,23 @@ int nr_processes(void) static struct kmem_cache *task_struct_cachep; #endif +#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR +static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +{ +#ifdef CONFIG_DEBUG_STACK_USAGE + gfp_t mask = GFP_KERNEL | __GFP_ZERO; +#else + gfp_t mask = GFP_KERNEL; +#endif + return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); +} + +static inline void free_thread_info(struct thread_info *ti) +{ + free_pages((unsigned long)ti, THREAD_SIZE_ORDER); +} +#endif + /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; -- cgit v1.1 From a8f18b909c0a3f22630846207035c8b84bb252b8 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 01:45:53 -0700 Subject: Add a WARN() macro; this is WARN_ON() + printk arguments Add a WARN() macro that acts like WARN_ON(), with the added feature that it takes a printk like argument that is printed as part of the warning message. [akpm@linux-foundation.org: fix printk arguments] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Arjan van de Ven Cc: Greg KH Cc: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 425567f..12c5a0a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -318,6 +318,28 @@ void warn_on_slowpath(const char *file, int line) add_taint(TAINT_WARN); } EXPORT_SYMBOL(warn_on_slowpath); + + +void warn_slowpath(const char *file, int line, const char *fmt, ...) 
+{ + va_list args; + char function[KSYM_SYMBOL_LEN]; + unsigned long caller = (unsigned long)__builtin_return_address(0); + sprint_symbol(function, caller); + + printk(KERN_WARNING "------------[ cut here ]------------\n"); + printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, + line, function); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + print_modules(); + dump_stack(); + print_oops_end_marker(); + add_taint(TAINT_WARN); +} +EXPORT_SYMBOL(warn_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR -- cgit v1.1 From 7a2c477069fbd32f91598f05334003979b987a39 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 01:45:54 -0700 Subject: kernel/irq/manage.c: replace a printk + WARN_ON() to a WARN() Replace a printk+WARN_ON() by a WARN(); this increases the chance of the string making it into the bugreport (ie: it goes inside the ---[ cut here ]--- section) Signed-off-by: Arjan van de Ven Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5bc6e5e..f8914b9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -260,9 +260,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) } } else { if (desc->wake_depth == 0) { - printk(KERN_WARNING "Unbalanced IRQ %d " - "wake disable\n", irq); - WARN_ON(1); + WARN(1, "Unbalanced IRQ %d wake disable\n", irq); } else if (--desc->wake_depth == 0) { ret = set_irq_wake_real(irq, on); if (ret) -- cgit v1.1 From 717115e1a5856b57af0f71e1df7149108294fc10 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 25 Jul 2008 01:45:58 -0700 Subject: printk ratelimiting rewrite All ratelimit user use same jiffies and burst params, so some messages (callbacks) will be lost. For example: a call printk_ratelimit(5 * HZ, 1) b call printk_ratelimit(5 * HZ, 1) before the 5*HZ timeout of a, then b will will be supressed. - rewrite __ratelimit, and use a ratelimit_state as parameter. Thanks for hints from andrew. - Add WARN_ON_RATELIMIT, update rcupreempt.h - remove __printk_ratelimit - use __ratelimit in net_ratelimit Signed-off-by: Dave Young Cc: "David S. Miller" Cc: "Paul E. McKenney" Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 17 +++-------------- kernel/sysctl.c | 4 ++-- 2 files changed, 5 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 3f7a2a9..a7f7559 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1308,6 +1308,8 @@ void tty_write_message(struct tty_struct *tty, char *msg) } #if defined CONFIG_PRINTK + +DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); /* * printk rate limiting, lifted from the networking subsystem. * @@ -1315,22 +1317,9 @@ void tty_write_message(struct tty_struct *tty, char *msg) * every printk_ratelimit_jiffies to make a denial-of-service * attack impossible. 
*/ -int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) -{ - return __ratelimit(ratelimit_jiffies, ratelimit_burst); -} -EXPORT_SYMBOL(__printk_ratelimit); - -/* minimum time in jiffies between messages */ -int printk_ratelimit_jiffies = 5 * HZ; - -/* number of messages we send before ratelimiting */ -int printk_ratelimit_burst = 10; - int printk_ratelimit(void) { - return __printk_ratelimit(printk_ratelimit_jiffies, - printk_ratelimit_burst); + return __ratelimit(&printk_ratelimit_state); } EXPORT_SYMBOL(printk_ratelimit); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1a8299d..35a50db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -624,7 +624,7 @@ static struct ctl_table kern_table[] = { { .ctl_name = KERN_PRINTK_RATELIMIT, .procname = "printk_ratelimit", - .data = &printk_ratelimit_jiffies, + .data = &printk_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -633,7 +633,7 @@ static struct ctl_table kern_table[] = { { .ctl_name = KERN_PRINTK_RATELIMIT_BURST, .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_burst, + .data = &printk_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, -- cgit v1.1 From ef53d9c5e4da147ecaa43c44c5e5945eb83970a2 Mon Sep 17 00:00:00 2001 From: Srinivasa D S Date: Fri, 25 Jul 2008 01:46:04 -0700 Subject: kprobes: improve kretprobe scalability with hashed locking Currently list of kretprobe instances are stored in kretprobe object (as used_instances,free_instances) and in kretprobe hash table. We have one global kretprobe lock to serialise the access to these lists. This causes only one kretprobe handler to execute at a time. Hence affects system performance, particularly on SMP systems and when return probe is set on lot of functions (like on all systemcalls). Solution proposed here gives fine-grain locks that performs better on SMP system compared to present kretprobe implementation. Solution: 1) Instead of having one global lock to protect kretprobe instances present in kretprobe object and kretprobe hash table. We will have two locks, one lock for protecting kretprobe hash table and another lock for kretporbe object. 2) We hold lock present in kretprobe object while we modify kretprobe instance in kretprobe object and we hold per-hash-list lock while modifying kretprobe instances present in that hash list. To prevent deadlock, we never grab a per-hash-list lock while holding a kretprobe lock. 3) We can remove used_instances from struct kretprobe, as we can track used instances of kretprobe instances using kretprobe hash table. Time duration for kernel compilation ("make -j 8") on a 8-way ppc64 system with return probes set on all systemcalls looks like this. cacheline non-cacheline Un-patched kernel aligned patch aligned patch =============================================================================== real 9m46.784s 9m54.412s 10m2.450s user 40m5.715s 40m7.142s 40m4.273s sys 2m57.754s 2m58.583s 3m17.430s =========================================================== Time duration for kernel compilation ("make -j 8) on the same system, when kernel is not probed. ========================= real 9m26.389s user 40m8.775s sys 2m7.283s ========================= Signed-off-by: Srinivasa DS Signed-off-by: Jim Keniston Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. 
Miller Cc: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 127 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 89 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1485ca8..cb0b3bd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -62,6 +62,7 @@ addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) #endif +static int kprobes_initialized; static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; @@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static bool kprobe_enabled; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ -DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static struct { + spinlock_t lock ____cacheline_aligned; +} kretprobe_table_locks[KPROBE_TABLE_SIZE]; + +static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +{ + return &(kretprobe_table_locks[hash].lock); +} /* * Normally, functions that we'd want to prohibit kprobes in, are marked @@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) return; } -/* Called with kretprobe_lock held */ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head) { + struct kretprobe *rp = ri->rp; + /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); - if (ri->rp) { - /* remove rp inst off the used list */ - hlist_del(&ri->uflist); - /* put rp inst back onto the free list */ - INIT_HLIST_NODE(&ri->uflist); - hlist_add_head(&ri->uflist, &ri->rp->free_instances); + INIT_HLIST_NODE(&ri->hlist); + if (likely(rp)) { + spin_lock(&rp->lock); + hlist_add_head(&ri->hlist, &rp->free_instances); + spin_unlock(&rp->lock); } else /* Unregistering */ hlist_add_head(&ri->hlist, head); } -struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) +void kretprobe_hash_lock(struct task_struct *tsk, + struct hlist_head **head, unsigned long *flags) +{ + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); + spinlock_t *hlist_lock; + + *head = &kretprobe_inst_table[hash]; + hlist_lock = kretprobe_table_lock_ptr(hash); + spin_lock_irqsave(hlist_lock, *flags); +} + +void kretprobe_table_lock(unsigned long hash, unsigned long *flags) { - return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; + spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + spin_lock_irqsave(hlist_lock, *flags); +} + +void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +{ + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); + spinlock_t *hlist_lock; + + hlist_lock = kretprobe_table_lock_ptr(hash); + spin_unlock_irqrestore(hlist_lock, *flags); +} + +void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +{ + spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + spin_unlock_irqrestore(hlist_lock, *flags); } /* @@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) struct kretprobe_instance *ri; struct hlist_head *head, empty_rp; struct hlist_node *node, *tmp; - unsigned long flags = 0; + unsigned long hash, flags = 0; - INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(tk); + if (unlikely(!kprobes_initialized)) + /* Early boot. kretprobe_table_locks not yet initialized. 
*/ + return; + + hash = hash_ptr(tk, KPROBE_HASH_BITS); + head = &kretprobe_inst_table[hash]; + kretprobe_table_lock(hash, &flags); hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { if (ri->task == tk) recycle_rp_inst(ri, &empty_rp); } - spin_unlock_irqrestore(&kretprobe_lock, flags); - + kretprobe_table_unlock(hash, &flags); + INIT_HLIST_HEAD(&empty_rp); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); @@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp) struct kretprobe_instance *ri; struct hlist_node *pos, *next; - hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { - hlist_del(&ri->uflist); + hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { + hlist_del(&ri->hlist); kfree(ri); } } static void __kprobes cleanup_rp_inst(struct kretprobe *rp) { - unsigned long flags; + unsigned long flags, hash; struct kretprobe_instance *ri; struct hlist_node *pos, *next; + struct hlist_head *head; + /* No race here */ - spin_lock_irqsave(&kretprobe_lock, flags); - hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { - ri->rp = NULL; - hlist_del(&ri->uflist); + for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { + kretprobe_table_lock(hash, &flags); + head = &kretprobe_inst_table[hash]; + hlist_for_each_entry_safe(ri, pos, next, head, hlist) { + if (ri->rp == rp) + ri->rp = NULL; + } + kretprobe_table_unlock(hash, &flags); } - spin_unlock_irqrestore(&kretprobe_lock, flags); free_rp_inst(rp); } @@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long flags = 0; + unsigned long hash, flags = 0; + struct kretprobe_instance *ri; /*TODO: consider to only swap the RA after the last pre_handler fired */ - spin_lock_irqsave(&kretprobe_lock, flags); + hash = hash_ptr(current, KPROBE_HASH_BITS); + spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { - struct kretprobe_instance *ri; - ri = hlist_entry(rp->free_instances.first, - struct kretprobe_instance, uflist); + struct kretprobe_instance, hlist); + hlist_del(&ri->hlist); + spin_unlock_irqrestore(&rp->lock, flags); + ri->rp = rp; ri->task = current; if (rp->entry_handler && rp->entry_handler(ri, regs)) { - spin_unlock_irqrestore(&kretprobe_lock, flags); + spin_unlock_irqrestore(&rp->lock, flags); return 0; } arch_prepare_kretprobe(ri, regs); /* XXX(hch): why is there no hlist_move_head? 
*/ - hlist_del(&ri->uflist); - hlist_add_head(&ri->uflist, &ri->rp->used_instances); - hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); - } else + INIT_HLIST_NODE(&ri->hlist); + kretprobe_table_lock(hash, &flags); + hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]); + kretprobe_table_unlock(hash, &flags); + } else { rp->nmissed++; - spin_unlock_irqrestore(&kretprobe_lock, flags); + spin_unlock_irqrestore(&rp->lock, flags); + } return 0; } @@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, rp->maxactive = NR_CPUS; #endif } - INIT_HLIST_HEAD(&rp->used_instances); + spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + @@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, free_rp_inst(rp); return -ENOMEM; } - INIT_HLIST_NODE(&inst->uflist); - hlist_add_head(&inst->uflist, &rp->free_instances); + INIT_HLIST_NODE(&inst->hlist); + hlist_add_head(&inst->hlist, &rp->free_instances); } rp->nmissed = 0; @@ -1009,6 +1058,7 @@ static int __init init_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); + spin_lock_init(&(kretprobe_table_locks[i].lock)); } /* @@ -1050,6 +1100,7 @@ static int __init init_kprobes(void) err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + kprobes_initialized = (err == 0); if (!err) init_test_probes(); -- cgit v1.1 From 8b6dd986823a8d92ed9f54baa5cef8604d9d9d44 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Fri, 25 Jul 2008 01:46:05 -0700 Subject: kprobes: remove redundant config check I noticed that there's a CONFIG_KPROBES check inside kernel/kprobes.c, which is redundant. Signed-off-by: Abhishek Sagar Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cb0b3bd..75bc2cd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1337,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); EXPORT_SYMBOL_GPL(register_jprobes); EXPORT_SYMBOL_GPL(unregister_jprobes); -#ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(jprobe_return); -#endif - -#ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(register_kretprobe); EXPORT_SYMBOL_GPL(unregister_kretprobe); EXPORT_SYMBOL_GPL(register_kretprobes); EXPORT_SYMBOL_GPL(unregister_kretprobes); -#endif -- cgit v1.1 From 7e9abd89cbdf9b73d327d8173343abce9022609b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 25 Jul 2008 01:46:54 -0700 Subject: cgroup: use read lock to guard find_existing_css_set() The function does not modify anything (except the temporary css template), so it's sufficient to hold read lock. 
Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15ac0e1..f50edad 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -415,11 +415,11 @@ static struct css_set *find_css_set( /* First see if we already have a cgroup group that matches * the desired set */ - write_lock(&css_set_lock); + read_lock(&css_set_lock); res = find_existing_css_set(oldcg, cgrp, template); if (res) get_css_set(res); - write_unlock(&css_set_lock); + read_unlock(&css_set_lock); if (res) return res; -- cgit v1.1 From 71cbb949d17d4d776abd547135feb7f3282405c8 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 25 Jul 2008 01:46:55 -0700 Subject: cgroup: list_for_each cleanup -------------------------- while() { list_entry(); ... } -------------------------- is equivalent to following code. -------------------------- list_for_each_entry(){ ... } -------------------------- later can review easily more. this patch is just clean up. it doesn't have any behavor change. Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f50edad..6836a90 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -241,17 +241,20 @@ static int use_task_css_set_links; */ static void unlink_css_set(struct css_set *cg) { + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + write_lock(&css_set_lock); hlist_del(&cg->hlist); css_set_count--; - while (!list_empty(&cg->cg_links)) { - struct cg_cgroup_link *link; - link = list_entry(cg->cg_links.next, - struct cg_cgroup_link, cg_link_list); + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); kfree(link); } + write_unlock(&css_set_lock); } @@ -363,15 +366,14 @@ static struct css_set *find_existing_css_set( static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; int i; INIT_LIST_HEAD(tmp); for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - while (!list_empty(tmp)) { - link = list_entry(tmp->next, - struct cg_cgroup_link, - cgrp_link_list); + list_for_each_entry_safe(link, saved_link, tmp, + cgrp_link_list) { list_del(&link->cgrp_link_list); kfree(link); } @@ -384,11 +386,10 @@ static int allocate_cg_links(int count, struct list_head *tmp) static void free_cg_links(struct list_head *tmp) { - while (!list_empty(tmp)) { - struct cg_cgroup_link *link; - link = list_entry(tmp->next, - struct cg_cgroup_link, - cgrp_link_list); + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { list_del(&link->cgrp_link_list); kfree(link); } @@ -1093,6 +1094,8 @@ static void cgroup_kill_sb(struct super_block *sb) { struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; int ret; + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; BUG_ON(!root); @@ -1112,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) { * root cgroup */ write_lock(&css_set_lock); - while (!list_empty(&cgrp->css_sets)) { - 
struct cg_cgroup_link *link; - link = list_entry(cgrp->css_sets.next, - struct cg_cgroup_link, cgrp_link_list); + + list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, + cgrp_link_list) { list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); kfree(link); @@ -1756,15 +1758,11 @@ int cgroup_add_files(struct cgroup *cgrp, int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; - struct list_head *l; + struct cg_cgroup_link *link; read_lock(&css_set_lock); - l = cgrp->css_sets.next; - while (l != &cgrp->css_sets) { - struct cg_cgroup_link *link = - list_entry(l, struct cg_cgroup_link, cgrp_link_list); + list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { count += atomic_read(&link->cg->ref.refcount); - l = l->next; } read_unlock(&css_set_lock); return count; -- cgit v1.1 From 8947f9d5b361ce927be6d5c11fed57905b7a4100 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 25 Jul 2008 01:46:56 -0700 Subject: cgroups: annotate two variables with __read_mostly - need_forkexit_callback will be read only after system boot. - use_task_css_set_links will be read only after it's set. And these 2 variables are checked when a new process is forked. Signed-off-by: Li Zefan Acked-by: Paul Menage Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6836a90..70d083c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -118,7 +118,7 @@ static int root_count; * extra work in the fork/exit path if none of the subsystems need to * be called. */ -static int need_forkexit_callback; +static int need_forkexit_callback __read_mostly; static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ @@ -220,7 +220,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ -static int use_task_css_set_links; +static int use_task_css_set_links __read_mostly; /* When we create or destroy a css_set, the operation simply * takes/releases a reference count on all the cgroups referenced -- cgit v1.1 From db3b14978abc02041046ed8353f0899cb58ffffc Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:46:58 -0700 Subject: cgroup files: add write_string cgroup control file method This patch adds a write_string() method for cgroups control files. The semantics are that a buffer is copied from userspace to kernelspace and the handler function invoked on that buffer. The buffer is guaranteed to be nul-terminated, and no longer than max_write_len (defaulting to 64 bytes if unspecified). Later patches will convert existing raw file write handlers in control group subsystems to use this method. 
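As a rough illustration of what a converted handler would look like (the file name and handler below are invented for this sketch; only the signature and the cftype fields come from the patch):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/* The cgroups core has already copied the data into a kernel buffer,
   nul-terminated it and stripped trailing whitespace, so the handler
   only has to parse 'buffer'.  Returning 0 is translated back into
   "nbytes written" for userspace. */
static int example_path_write(struct cgroup *cgrp, struct cftype *cft,
			      const char *buffer)
{
	if (strlen(buffer) >= PATH_MAX)
		return -ENAMETOOLONG;
	printk(KERN_INFO "example: new path '%s'\n", buffer);
	return 0;
}

static struct cftype cft_example = {
	.name = "example_path",
	.write_string = example_path_write,
	.max_write_len = PATH_MAX,	/* raise the 64-byte default */
};
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~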
Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Acked-by: Balbir Singh Acked-by: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 70d083c..3a99cc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1363,6 +1363,39 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, return retval; } +static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) +{ + char local_buffer[64]; + int retval = 0; + size_t max_bytes = cft->max_write_len; + char *buffer = local_buffer; + + if (!max_bytes) + max_bytes = sizeof(local_buffer) - 1; + if (nbytes >= max_bytes) + return -E2BIG; + /* Allocate a dynamic buffer if we need one */ + if (nbytes >= sizeof(local_buffer)) { + buffer = kmalloc(nbytes + 1, GFP_KERNEL); + if (buffer == NULL) + return -ENOMEM; + } + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + strstrip(buffer); + retval = cft->write_string(cgrp, cft, buffer); + if (!retval) + retval = nbytes; + if (buffer != local_buffer) + kfree(buffer); + return retval; +} + static ssize_t cgroup_common_file_write(struct cgroup *cgrp, struct cftype *cft, struct file *file, @@ -1440,6 +1473,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, return cft->write(cgrp, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + if (cft->write_string) + return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); if (cft->trigger) { int ret = cft->trigger(cgrp, (unsigned int)cft->private); return ret ? ret : nbytes; -- cgit v1.1 From e788e066c651b1bbf4a927dc95395c1aa13be436 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:46:59 -0700 Subject: cgroup files: move the release_agent file to use typed handlers Adds cgroup_release_agent_write() and cgroup_release_agent_show() methods to handle writing/reading the path to a cgroup hierarchy's release agent. As a result, cgroup_common_file_read() is now unnecessary. As part of the change, a previously-tolerated race in cgroup_release_agent() is avoided by copying the current release_agent_path prior to calling call_usermode_helper(). Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Acked-by: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 125 ++++++++++++++++++++++++++------------------------------ 1 file changed, 57 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3a99cc2..0120b5d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -89,11 +89,7 @@ struct cgroupfs_root { /* Hierarchy-specific flags */ unsigned long flags; - /* The path to use for release notifications. No locking - * between setting and use - so if userspace updates this - * while child cgroups exist, you could miss a - * notification. We ensure that it's always a valid - * NUL-terminated string */ + /* The path to use for release notifications. 
*/ char release_agent_path[PATH_MAX]; }; @@ -1329,6 +1325,45 @@ enum cgroup_filetype { FILE_RELEASE_AGENT, }; +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * Returns true (with lock held) on success, or false (with no lock + * held) on failure. + */ +int cgroup_lock_live_group(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_mutex); + if (cgroup_is_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return false; + } + return true; +} + +static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + strcpy(cgrp->root->release_agent_path, buffer); + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *seq) +{ + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + seq_puts(seq, cgrp->root->release_agent_path); + seq_putc(seq, '\n'); + mutex_unlock(&cgroup_mutex); + return 0; +} + static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, @@ -1443,10 +1478,6 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, else clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); break; - case FILE_RELEASE_AGENT: - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - strcpy(cgrp->root->release_agent_path, buffer); - break; default: retval = -EINVAL; goto out2; @@ -1506,49 +1537,6 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } -static ssize_t cgroup_common_file_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) -{ - enum cgroup_filetype type = cft->private; - char *page; - ssize_t retval = 0; - char *s; - - if (!(page = (char *)__get_free_page(GFP_KERNEL))) - return -ENOMEM; - - s = page; - - switch (type) { - case FILE_RELEASE_AGENT: - { - struct cgroupfs_root *root; - size_t n; - mutex_lock(&cgroup_mutex); - root = cgrp->root; - n = strnlen(root->release_agent_path, - sizeof(root->release_agent_path)); - n = min(n, (size_t) PAGE_SIZE); - strncpy(s, root->release_agent_path, n); - mutex_unlock(&cgroup_mutex); - s += n; - break; - } - default: - retval = -EINVAL; - goto out; - } - *s++ = '\n'; - - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; -} - static ssize_t cgroup_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { @@ -1606,6 +1594,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file) static struct file_operations cgroup_seqfile_operations = { .read = seq_read, + .write = cgroup_file_write, .llseek = seq_lseek, .release = cgroup_seqfile_release, }; @@ -2283,8 +2272,9 @@ static struct cftype files[] = { static struct cftype cft_release_agent = { .name = "release_agent", - .read = cgroup_common_file_read, - .write = cgroup_common_file_write, + .read_seq_string = cgroup_release_agent_show, + .write_string = cgroup_release_agent_write, + .max_write_len = PATH_MAX, .private = FILE_RELEASE_AGENT, }; @@ -3111,27 +3101,24 @@ static void cgroup_release_agent(struct work_struct *work) while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; - char *pathbuf; + char *pathbuf = NULL, *agentbuf = NULL; 
struct cgroup *cgrp = list_entry(release_list.next, struct cgroup, release_list); list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!pathbuf) { - spin_lock(&release_list_lock); - continue; - } - - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { - kfree(pathbuf); - spin_lock(&release_list_lock); - continue; - } + if (!pathbuf) + goto continue_free; + if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + goto continue_free; + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!agentbuf) + goto continue_free; i = 0; - argv[i++] = cgrp->root->release_agent_path; - argv[i++] = (char *)pathbuf; + argv[i++] = agentbuf; + argv[i++] = pathbuf; argv[i] = NULL; i = 0; @@ -3145,8 +3132,10 @@ static void cgroup_release_agent(struct work_struct *work) * be a slow process */ mutex_unlock(&cgroup_mutex); call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - kfree(pathbuf); mutex_lock(&cgroup_mutex); + continue_free: + kfree(pathbuf); + kfree(agentbuf); spin_lock(&release_list_lock); } spin_unlock(&release_list_lock); -- cgit v1.1 From 84eea842886ac35020be6043e04748ed22014359 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:00 -0700 Subject: cgroups: misc cleanups to write_string patchset This patch contains cleanups suggested by reviewers for the recent write_string() patchset: - pair cgroup_lock_live_group() with cgroup_unlock() in cgroup.c for clarity, rather than directly unlocking cgroup_mutex. - make the return type of cgroup_lock_live_group() a bool - use a #define'd constant for the local buffer size in read/write functions Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Acked-by: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0120b5d..a14122e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1329,10 +1329,10 @@ enum cgroup_filetype { * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness * - * Returns true (with lock held) on success, or false (with no lock - * held) on failure. + * On success, returns true; the lock should be later released with + * cgroup_unlock(). On failure returns false with no lock held. 
*/ -int cgroup_lock_live_group(struct cgroup *cgrp) +bool cgroup_lock_live_group(struct cgroup *cgrp) { mutex_lock(&cgroup_mutex); if (cgroup_is_removed(cgrp)) { @@ -1349,7 +1349,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, if (!cgroup_lock_live_group(cgrp)) return -ENODEV; strcpy(cgrp->root->release_agent_path, buffer); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return 0; } @@ -1360,16 +1360,19 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); seq_putc(seq, '\n'); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return 0; } +/* A buffer size big enough for numbers or short strings */ +#define CGROUP_LOCAL_BUFFER_SIZE 64 + static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - char buffer[64]; + char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; char *end; @@ -1403,7 +1406,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - char local_buffer[64]; + char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; size_t max_bytes = cft->max_write_len; char *buffer = local_buffer; @@ -1518,7 +1521,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, char __user *buf, size_t nbytes, loff_t *ppos) { - char tmp[64]; + char tmp[CGROUP_LOCAL_BUFFER_SIZE]; u64 val = cft->read_u64(cgrp, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); @@ -1530,7 +1533,7 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, char __user *buf, size_t nbytes, loff_t *ppos) { - char tmp[64]; + char tmp[CGROUP_LOCAL_BUFFER_SIZE]; s64 val = cft->read_s64(cgrp, cft); int len = sprintf(tmp, "%lld\n", (long long) val); -- cgit v1.1 From 6379c106152388f7ea45d6dda63edda0e9181fc8 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:01 -0700 Subject: cgroup files: move notify_on_release file to separate write handler This patch moves the write handler for the cgroups notify_on_release file into a separate handler. This handler requires no cgroups locking since it relies on atomic bitops for synchronization. 
Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a14122e..d597d30 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1474,13 +1474,6 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, case FILE_TASKLIST: retval = attach_task_by_pid(cgrp, buffer); break; - case FILE_NOTIFY_ON_RELEASE: - clear_bit(CGRP_RELEASABLE, &cgrp->flags); - if (simple_strtoul(buffer, NULL, 10) != 0) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - break; default: retval = -EINVAL; goto out2; @@ -2252,6 +2245,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, return notify_on_release(cgrp); } +static int cgroup_write_notify_on_release(struct cgroup *cgrp, + struct cftype *cft, + u64 val) +{ + clear_bit(CGRP_RELEASABLE, &cgrp->flags); + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + return 0; +} + /* * for the common functions, 'private' gives the type of file */ @@ -2268,7 +2273,7 @@ static struct cftype files[] = { { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, - .write = cgroup_common_file_write, + .write_u64 = cgroup_write_notify_on_release, .private = FILE_NOTIFY_ON_RELEASE, }, }; -- cgit v1.1 From af351026aafc8da16518a02b41c66d3e0c1cdef4 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:01 -0700 Subject: cgroup files: turn attach_task_by_pid directly into a cgroup write handler This patch changes attach_task_by_pid() to take a u64 rather than a string; as a result it can be called directly as a control groups write_u64 handler, and cgroup_common_file_write() can be removed. Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 80 ++++++++++----------------------------------------------- 1 file changed, 14 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d597d30..86b71e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -504,10 +504,6 @@ static struct css_set *find_css_set( * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * - * The cgroup_common_file_write handler for operations that modify - * the cgroup hierarchy holds cgroup_mutex across the entire operation, - * single threading all such cgroup modifications across the system. - * * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't * (usually) take cgroup_mutex. These are the two most performance * critical pieces of code here. The exception occurs on cgroup_exit(), @@ -1279,18 +1275,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with - * cgroup_mutex, may take task_lock of task + * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex + * held. 
May take task_lock of task */ -static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { - pid_t pid; struct task_struct *tsk; int ret; - if (sscanf(pidbuf, "%d", &pid) != 1) - return -EIO; - if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); @@ -1316,6 +1308,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) return ret; } +static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +{ + int ret; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + ret = attach_task_by_pid(cgrp, pid); + cgroup_unlock(); + return ret; +} + /* The various types of files and directories in a cgroup file system */ enum cgroup_filetype { FILE_ROOT, @@ -1434,60 +1436,6 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, return retval; } -static ssize_t cgroup_common_file_write(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) -{ - enum cgroup_filetype type = cft->private; - char *buffer; - int retval = 0; - - if (nbytes >= PATH_MAX) - return -E2BIG; - - /* +1 for nul-terminator */ - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - - if (copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out1; - } - buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); /* strip -just- trailing whitespace */ - - mutex_lock(&cgroup_mutex); - - /* - * This was already checked for in cgroup_file_write(), but - * check again now we're holding cgroup_mutex. - */ - if (cgroup_is_removed(cgrp)) { - retval = -ENODEV; - goto out2; - } - - switch (type) { - case FILE_TASKLIST: - retval = attach_task_by_pid(cgrp, buffer); - break; - default: - retval = -EINVAL; - goto out2; - } - - if (retval == 0) - retval = nbytes; -out2: - mutex_unlock(&cgroup_mutex); -out1: - kfree(buffer); - return retval; -} - static ssize_t cgroup_file_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { @@ -2265,7 +2213,7 @@ static struct cftype files[] = { .name = "tasks", .open = cgroup_tasks_open, .read = cgroup_tasks_read, - .write = cgroup_common_file_write, + .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, }, -- cgit v1.1 From e37123953292146445c8629b3950d0513fd10ae2 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:02 -0700 Subject: cgroup files: remove cpuset_common_file_write() This patch tweaks the signatures of the update_cpumask() and update_nodemask() functions so that they can be called directly as handlers for the new cgroups write_string() method. This allows cpuset_common_file_write() to be removed. Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 109 ++++++++++++++++++-------------------------------------- 1 file changed, 35 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d573891..276ce7e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -227,10 +227,6 @@ static struct cpuset top_cpuset = { * The task_struct fields mems_allowed and mems_generation may only * be accessed in the context of that task, so require no locks. 
* - * The cpuset_common_file_write handler for operations that modify - * the cpuset hierarchy holds cgroup_mutex across the entire operation, - * single threading all such cpuset modifications across the system. - * * The cpuset_common_file_read() handlers only hold callback_mutex across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. @@ -772,7 +768,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, * @cs: the cpuset to consider * @buf: buffer of cpu numbers written to this cpuset */ -static int update_cpumask(struct cpuset *cs, char *buf) +static int update_cpumask(struct cpuset *cs, const char *buf) { struct cpuset trialcs; struct cgroup_scanner scan; @@ -792,7 +788,6 @@ static int update_cpumask(struct cpuset *cs, char *buf) * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. */ - buf = strstrip(buf); if (!*buf) { cpus_clear(trialcs.cpus_allowed); } else { @@ -902,7 +897,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void *cpuset_being_rebound; -static int update_nodemask(struct cpuset *cs, char *buf) +static int update_nodemask(struct cpuset *cs, const char *buf) { struct cpuset trialcs; nodemask_t oldmem; @@ -929,7 +924,6 @@ static int update_nodemask(struct cpuset *cs, char *buf) * that parsing. The validate_change() call ensures that cpusets * with tasks have memory. */ - buf = strstrip(buf); if (!*buf) { nodes_clear(trialcs.mems_allowed); } else { @@ -1256,72 +1250,14 @@ typedef enum { FILE_SPREAD_SLAB, } cpuset_filetype_t; -static ssize_t cpuset_common_file_write(struct cgroup *cont, - struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) -{ - struct cpuset *cs = cgroup_cs(cont); - cpuset_filetype_t type = cft->private; - char *buffer; - int retval = 0; - - /* Crude upper limit on largest legitimate cpulist user might write. 
*/ - if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES)) - return -E2BIG; - - /* +1 for nul-terminator */ - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - if (copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out1; - } - buffer[nbytes] = 0; /* nul-terminate */ - - cgroup_lock(); - - if (cgroup_is_removed(cont)) { - retval = -ENODEV; - goto out2; - } - - switch (type) { - case FILE_CPULIST: - retval = update_cpumask(cs, buffer); - break; - case FILE_MEMLIST: - retval = update_nodemask(cs, buffer); - break; - default: - retval = -EINVAL; - goto out2; - } - - if (retval == 0) - retval = nbytes; -out2: - cgroup_unlock(); -out1: - kfree(buffer); - return retval; -} - static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) { int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; - cgroup_lock(); - - if (cgroup_is_removed(cgrp)) { - cgroup_unlock(); + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; - } switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1367,12 +1303,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; - cgroup_lock(); - - if (cgroup_is_removed(cgrp)) { - cgroup_unlock(); + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; - } + switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: retval = update_relax_domain_level(cs, val); @@ -1386,6 +1319,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) } /* + * Common handling for a write to a "cpus" or "mems" file. + */ +static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + int retval = 0; + + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + + switch (cft->private) { + case FILE_CPULIST: + retval = update_cpumask(cgroup_cs(cgrp), buf); + break; + case FILE_MEMLIST: + retval = update_nodemask(cgroup_cs(cgrp), buf); + break; + default: + retval = -EINVAL; + break; + } + cgroup_unlock(); + return retval; +} + +/* * These ascii lists should be read in a single call, by using a user * buffer large enough to hold the entire map. If read in smaller * chunks, there is no guarantee of atomicity. Since the display format @@ -1504,14 +1463,16 @@ static struct cftype files[] = { { .name = "cpus", .read = cpuset_common_file_read, - .write = cpuset_common_file_write, + .write_string = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, { .name = "mems", .read = cpuset_common_file_read, - .write = cpuset_common_file_write, + .write_string = cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, }, -- cgit v1.1 From 856c13aa1ff6136c1968414fdea5938ea9d5ebf2 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:04 -0700 Subject: cgroup files: convert res_counter_write() to be a cgroups write_string() handler Currently res_counter_write() is a raw file handler even though it's ultimately taking a number, since in some cases it wants to pre-process the string when converting it to a number. This patch converts res_counter_write() from a raw file handler to a write_string() handler; this allows some of the boilerplate copying/locking/checking to be removed, and simplies the cleanup path, since these functions are now performed by the cgroups framework. 
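For illustration only (this sketch is not part of the patch, and the helper name example_counter_from_cgrp() is invented): with res_counter_write() taking a plain, already-sanitized string, and using the new signatures visible in the diff below, a controller's write_string() handler can shrink to roughly:

	static int example_limit_write(struct cgroup *cgrp, struct cftype *cft,
				       const char *buf)
	{
		struct res_counter *counter = example_counter_from_cgrp(cgrp);

		/*
		 * The copying, locking and basic checking mentioned above are
		 * now done by the cgroups core, so the handler only has to
		 * parse and store the value.
		 */
		return res_counter_write(counter, cft->private, buf,
					 res_counter_memparse_write_strategy);
	}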
[lizf@cn.fujitsu.com: build fix] Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 48 +++++++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d3c61b4..f275c8e 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -13,6 +13,7 @@ #include #include #include +#include void res_counter_init(struct res_counter *counter) { @@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member) return *res_counter_member(counter, member); } -ssize_t res_counter_write(struct res_counter *counter, int member, - const char __user *userbuf, size_t nbytes, loff_t *pos, - int (*write_strategy)(char *st_buf, unsigned long long *val)) +int res_counter_memparse_write_strategy(const char *buf, + unsigned long long *res) { - int ret; - char *buf, *end; - unsigned long flags; - unsigned long long tmp, *val; - - buf = kmalloc(nbytes + 1, GFP_KERNEL); - ret = -ENOMEM; - if (buf == NULL) - goto out; + char *end; + /* FIXME - make memparse() take const char* args */ + *res = memparse((char *)buf, &end); + if (*end != '\0') + return -EINVAL; - buf[nbytes] = '\0'; - ret = -EFAULT; - if (copy_from_user(buf, userbuf, nbytes)) - goto out_free; + *res = PAGE_ALIGN(*res); + return 0; +} - ret = -EINVAL; +int res_counter_write(struct res_counter *counter, int member, + const char *buf, write_strategy_fn write_strategy) +{ + char *end; + unsigned long flags; + unsigned long long tmp, *val; - strstrip(buf); if (write_strategy) { - if (write_strategy(buf, &tmp)) { - goto out_free; - } + if (write_strategy(buf, &tmp)) + return -EINVAL; } else { tmp = simple_strtoull(buf, &end, 10); if (*end != '\0') - goto out_free; + return -EINVAL; } spin_lock_irqsave(&counter->lock, flags); val = res_counter_member(counter, member); *val = tmp; spin_unlock_irqrestore(&counter->lock, flags); - ret = nbytes; -out_free: - kfree(buf); -out: - return ret; + return 0; } -- cgit v1.1 From e885dcde75685e09f23cffae1f6d5169c105b8a0 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Fri, 25 Jul 2008 01:47:06 -0700 Subject: cgroup_clone: use pid of newly created task for new cgroup cgroup_clone creates a new cgroup with the pid of the task. This works correctly for unshare, but for clone cgroup_clone is called from copy_namespaces inside copy_process, which happens before the new pid is created. As a result, the new cgroup was created with current's pid. This patch: 1. Moves the call inside copy_process to after the new pid is created 2. Passes the struct pid into ns_cgroup_clone (as it is not yet attached to the task) 3. Passes a name from ns_cgroup_clone() into cgroup_clone() so as to keep cgroup_clone() itself simpler 4. Uses pid_vnr() to get the process id value, so that the pid used to name the new cgroup is always the pid as it would be known to the task which did the cloning or unsharing. I think that is the most intuitive thing to do. This way, task t1 does clone(CLONE_NEWPID) to get t2, which does clone(CLONE_NEWPID) to get t3, then the cgroup for t3 will be named for the pid by which t2 knows t3. (Thanks to Dan Smith for finding the main bug) Changelog: June 11: Incorporate Paul Menage's feedback: don't pass NULL to ns_cgroup_clone from unshare, and reduce patch size by using 'nodename' in cgroup_clone. 
June 10: Original version [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge Hallyn Acked-by: Paul Menage Tested-by: Dan Smith Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++---- kernel/fork.c | 6 ++++++ kernel/ns_cgroup.c | 8 ++++++-- kernel/nsproxy.c | 8 +------- 4 files changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86b71e7..66ec9fd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2848,16 +2848,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) * cgroup_clone - clone the cgroup the given subsystem is attached to * @tsk: the task to be moved * @subsys: the given subsystem + * @nodename: the name for the new cgroup * * Duplicate the current cgroup in the hierarchy that the given * subsystem is attached to, and move this task into the new * child. */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) +int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, + char *nodename) { struct dentry *dentry; int ret = 0; - char nodename[MAX_CGROUP_TYPE_NAMELEN]; struct cgroup *parent, *child; struct inode *inode; struct css_set *cg; @@ -2882,8 +2883,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); - snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid); - /* Pin the hierarchy */ atomic_inc(&parent->root->sb->s_active); diff --git a/kernel/fork.c b/kernel/fork.c index 5a5d6fe..228f80c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1107,6 +1107,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; + if (current->nsproxy != p->nsproxy) { + retval = ns_cgroup_clone(p, pid); + if (retval) + goto bad_fork_free_pid; + } + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? 
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 48d7ed6..43c2111 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns( struct ns_cgroup, css); } -int ns_cgroup_clone(struct task_struct *task) +int ns_cgroup_clone(struct task_struct *task, struct pid *pid) { - return cgroup_clone(task, &ns_subsys); + char name[PROC_NUMBUF]; + + snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); + return cgroup_clone(task, &ns_subsys, name); } /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index adc7851..21575fc 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -157,12 +157,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - err = ns_cgroup_clone(tsk); - if (err) { - put_nsproxy(new_ns); - goto out; - } - tsk->nsproxy = new_ns; out: @@ -209,7 +203,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, goto out; } - err = ns_cgroup_clone(current); + err = ns_cgroup_clone(current, task_pid(current)); if (err) put_nsproxy(*new_nsp); -- cgit v1.1 From 0b2f630a28d53b5a2082a5275bc3334b10373508 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 25 Jul 2008 01:47:21 -0700 Subject: cpusets: restructure the function update_cpumask() and update_nodemask() Extract two functions from update_cpumask() and update_nodemask().They will be used later for updating tasks' cpus_allowed and mems_allowed after CPU/NODE offline/online. [lizf@cn.fujitsu.com: build fix] Signed-off-by: Miao Xie Acked-by: Paul Jackson Cc: David Rientjes Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 181 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 109 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 276ce7e..7326d51 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -764,6 +764,37 @@ static void cpuset_change_cpumask(struct task_struct *tsk, } /** + * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * + * Called with cgroup_mutex held + * + * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * calling callback functions for each. + * + * Return 0 if successful, -errno if not. 
+ */ +static int update_tasks_cpumask(struct cpuset *cs) +{ + struct cgroup_scanner scan; + struct ptr_heap heap; + int retval; + + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); + if (retval) + return retval; + + scan.cg = cs->css.cgroup; + scan.test_task = cpuset_test_cpumask; + scan.process_task = cpuset_change_cpumask; + scan.heap = &heap; + retval = cgroup_scan_tasks(&scan); + + heap_free(&heap); + return retval; +} + +/** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider * @buf: buffer of cpu numbers written to this cpuset @@ -771,8 +802,6 @@ static void cpuset_change_cpumask(struct task_struct *tsk, static int update_cpumask(struct cpuset *cs, const char *buf) { struct cpuset trialcs; - struct cgroup_scanner scan; - struct ptr_heap heap; int retval; int is_load_balanced; @@ -806,10 +835,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) return 0; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - return retval; - is_load_balanced = is_sched_load_balance(&trialcs); mutex_lock(&callback_mutex); @@ -820,12 +845,9 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * Scan tasks in the cpuset, and update the cpumasks of any * that need an update. */ - scan.cg = cs->css.cgroup; - scan.test_task = cpuset_test_cpumask; - scan.process_task = cpuset_change_cpumask; - scan.heap = &heap; - cgroup_scan_tasks(&scan); - heap_free(&heap); + retval = update_tasks_cpumask(cs); + if (retval < 0) + return retval; if (is_load_balanced) rebuild_sched_domains(); @@ -881,73 +903,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, mutex_unlock(&callback_mutex); } -/* - * Handle user request to change the 'mems' memory placement - * of a cpuset. Needs to validate the request, update the - * cpusets mems_allowed and mems_generation, and for each - * task in the cpuset, rebind any vma mempolicies and if - * the cpuset is marked 'memory_migrate', migrate the tasks - * pages to the new memory. - * - * Call with cgroup_mutex held. May take callback_mutex during call. - * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind - * their mempolicies to the cpusets new mems_allowed. - */ - static void *cpuset_being_rebound; -static int update_nodemask(struct cpuset *cs, const char *buf) +/** + * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * @cs: the cpuset in which each task's mems_allowed mask needs to be changed + * @oldmem: old mems_allowed of cpuset cs + * + * Called with cgroup_mutex held + * Return 0 if successful, -errno if not. + */ +static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) { - struct cpuset trialcs; - nodemask_t oldmem; struct task_struct *p; struct mm_struct **mmarray; int i, n, ntasks; int migrate; int fudge; - int retval; struct cgroup_iter it; - - /* - * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) - return -EACCES; - - trialcs = *cs; - - /* - * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. 
- */ - if (!*buf) { - nodes_clear(trialcs.mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs.mems_allowed); - if (retval < 0) - goto done; - - if (!nodes_subset(trialcs.mems_allowed, - node_states[N_HIGH_MEMORY])) - return -EINVAL; - } - oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs.mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } - retval = validate_change(cs, &trialcs); - if (retval < 0) - goto done; - - mutex_lock(&callback_mutex); - cs->mems_allowed = trialcs.mems_allowed; - cs->mems_generation = cpuset_mems_generation++; - mutex_unlock(&callback_mutex); + int retval; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ @@ -1014,7 +988,7 @@ static int update_nodemask(struct cpuset *cs, const char *buf) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) - cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); + cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); mmput(mm); } @@ -1026,6 +1000,70 @@ done: return retval; } +/* + * Handle user request to change the 'mems' memory placement + * of a cpuset. Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies and if + * the cpuset is marked 'memory_migrate', migrate the tasks + * pages to the new memory. + * + * Call with cgroup_mutex held. May take callback_mutex during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. + */ +static int update_nodemask(struct cpuset *cs, const char *buf) +{ + struct cpuset trialcs; + nodemask_t oldmem; + int retval; + + /* + * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; + * it's read-only + */ + if (cs == &top_cpuset) + return -EACCES; + + trialcs = *cs; + + /* + * An empty mems_allowed is ok iff there are no tasks in the cpuset. + * Since nodelist_parse() fails on an empty mask, we special case + * that parsing. The validate_change() call ensures that cpusets + * with tasks have memory. + */ + if (!*buf) { + nodes_clear(trialcs.mems_allowed); + } else { + retval = nodelist_parse(buf, trialcs.mems_allowed); + if (retval < 0) + goto done; + + if (!nodes_subset(trialcs.mems_allowed, + node_states[N_HIGH_MEMORY])) + return -EINVAL; + } + oldmem = cs->mems_allowed; + if (nodes_equal(oldmem, trialcs.mems_allowed)) { + retval = 0; /* Too easy - nothing to do */ + goto done; + } + retval = validate_change(cs, &trialcs); + if (retval < 0) + goto done; + + mutex_lock(&callback_mutex); + cs->mems_allowed = trialcs.mems_allowed; + cs->mems_generation = cpuset_mems_generation++; + mutex_unlock(&callback_mutex); + + retval = update_tasks_nodemask(cs, &oldmem); +done: + return retval; +} + int current_cpuset_is_being_rebound(void) { return task_cs(current) == cpuset_being_rebound; @@ -1935,7 +1973,6 @@ void __init cpuset_init_smp(void) } /** - * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. -- cgit v1.1 From f9b4fb8dabf38fb456c97f01aace07cb6e7c1723 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 25 Jul 2008 01:47:22 -0700 Subject: cpusets: update task's cpus_allowed and mems_allowed after CPU/NODE offline/online The bug is that a task may run on the cpu/node which is not in its cpuset.cpus/ cpuset.mems. 
It can be reproduced by the following commands: ----------------------------------- # mkdir /dev/cpuset # mount -t cpuset xxx /dev/cpuset # mkdir /dev/cpuset/0 # echo 0-1 > /dev/cpuset/0/cpus # echo 0 > /dev/cpuset/0/mems # echo $$ > /dev/cpuset/0/tasks # echo 0 > /sys/devices/system/cpu/cpu1/online # echo 1 > /sys/devices/system/cpu/cpu1/online ----------------------------------- There is only CPU0 in cpuset.cpus, but the task in this cpuset runs on both CPU0 and CPU1. This is because the task's cpus_allowed didn't get updated after the CPU offline/online manipulation. The same applies to mems_allowed. This patch fixes this bug except for the root cpuset, because it is an open question whether it is necessary to update all the tasks in the root cpuset after cpu/node offline/online. If we update them, some kernel threads which are bound to a specific cpu will be unbound. If we don't, the root cpuset keeps a bug of its own, also triggered by offline/online manipulation. For example, on a dual-cpu machine we create a sub cpuset in the root cpuset and assign 1 to its cpus, and then attach some tasks to this sub cpuset. After this, we offline CPU1. Now the tasks in this sub cpuset are moved into the root cpuset automatically because there is no cpu left in the sub cpuset. Then we online CPU1 and find that all the tasks which didn't originally belong to the root cpuset still run only on CPU0. Maybe we need to add a flag in the task_struct to mark which tasks can't be unbound? Signed-off-by: Miao Xie Acked-by: Paul Jackson Cc: Li Zefan Cc: Paul Jackson Cc: Paul Menage Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7326d51..6eae663 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1851,6 +1851,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root) struct cpuset *child; /* scans child cpusets of cp */ struct list_head queue; struct cgroup *cont; + nodemask_t oldmems; INIT_LIST_HEAD(&queue); @@ -1870,6 +1871,8 @@ static void scan_for_empty_cpusets(const struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; + oldmems = cp->mems_allowed; + /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); @@ -1881,6 +1884,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root) if (cpus_empty(cp->cpus_allowed) || nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); + else { + update_tasks_cpumask(cp); + update_tasks_nodemask(cp, &oldmems); + } } } -- cgit v1.1 From c372e817afc629fea9ff6321313325ed0b4a855b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 25 Jul 2008 01:47:23 -0700 Subject: cpuset: avoid unnecessary sched domains rebuilding When changing 'sched_relax_domain_level', don't rebuild sched domains if 'cpus' is empty or 'sched_load_balance' is not set. Also make the comments of rebuild_sched_domains() more readable.
Signed-off-by: Li Zefan Cc: Hidetoshi Seto Cc: Paul Jackson Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6eae663..60d2c47 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -496,11 +496,16 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) /* * rebuild_sched_domains() * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. + * This routine will be called to rebuild the scheduler's dynamic + * sched domains: + * - if the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, + * - or if the 'cpus' allowed changes in any cpuset which has that + * flag enabled, + * - or if the 'sched_relax_domain_level' of any cpuset which has + * that flag enabled and with non-empty 'cpus' changes, + * - or if any cpuset with non-empty 'cpus' is removed, + * - or if a cpu gets offlined. * * This routine builds a partial partition of the systems CPUs * (the set of non-overlappping cpumask_t's in the array 'part' @@ -1076,7 +1081,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; - rebuild_sched_domains(); + if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) + rebuild_sched_domains(); } return 0; -- cgit v1.1 From 489a5393a20dcbf91104052120eb2eff8791b61b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 25 Jul 2008 01:47:23 -0700 Subject: cpuset: don't pass empty cpumasks to partition_sched_domains() I create lots of empty cpusets(empty cpumasks) and turn off the "sched_load_balance" in top cpuset. I found that all these empty cpumasks are passed to partition_sched_domains() in rebuild_sched_domains(), it's very time-consuming for partition_sched_domains() and it's not need. It also reduce memory consumed and some works in rebuild_sched_domains() too. Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 60d2c47..531b235 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -610,8 +610,13 @@ void rebuild_sched_domains(void) while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { struct cgroup *cont; struct cpuset *child; /* scans child cpusets of cp */ + + if (cpus_empty(cp->cpus_allowed)) + continue; + if (is_sched_load_balance(cp)) csa[csn++] = cp; + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); __kfifo_put(q, (void *)&child, sizeof(cp)); -- cgit v1.1 From 02412483777651a26b19a75e49c2a451a174ca9c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 25 Jul 2008 01:47:24 -0700 Subject: cpuset: code-cleanup for started_after cgroup(cgroup_scan_tasks) will initialize heap->gt for us. This patch removes started_after() and its helper-function. 
Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 531b235..ebbc9b0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -709,36 +709,6 @@ done: /* Don't kfree(dattr) -- partition_sched_domains() does that. */ } -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) -{ - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) - * simultaneously. - */ - return t1 > t2; - } -} - -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); -} - /** * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's * @tsk: task to test @@ -790,7 +760,12 @@ static int update_tasks_cpumask(struct cpuset *cs) struct ptr_heap heap; int retval; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); + /* + * cgroup_scan_tasks() will initialize heap->gt for us. + * heap_init() is still needed here for we should not change + * cs->cpus_allowed when heap_init() fails. + */ + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); if (retval) return retval; -- cgit v1.1 From da5ef6bb96158b0fc0d808704237a453af449124 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 25 Jul 2008 01:47:25 -0700 Subject: cpuset: two minor code-cleanups In cpuset_update_task_memory_state() local variable struct task_struct *tsk = current; And local variable tsk is used 14 times and statement task_cs(tsk) is used twice in this function. So using task_cs(tsk) instead of task_cs(current) is better for readability. And "(struct cgroup_scanner *)&scan" is not good for readability also. 
(and "container_of" is used in cpuset_do_move_task(), not "(cpuset_hotplug_scanner *)scan") Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ebbc9b0..91cf85b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -365,7 +365,7 @@ void cpuset_update_task_memory_state(void) my_cpusets_mem_gen = top_cpuset.mems_generation; } else { rcu_read_lock(); - my_cpusets_mem_gen = task_cs(current)->mems_generation; + my_cpusets_mem_gen = task_cs(tsk)->mems_generation; rcu_read_unlock(); } @@ -1777,7 +1777,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) scan.scan.heap = NULL; scan.to = to->css.cgroup; - if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) + if (cgroup_scan_tasks(&scan.scan)) printk(KERN_ERR "move_member_tasks_to_cpuset: " "cgroup_scan_tasks failed\n"); } -- cgit v1.1 From 4b7a1304267bff68260ae861784b27130e805be3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:26 -0700 Subject: posix timers: timer_delete: remove the bogus "->it_process != NULL" check sys_timer_delete() and itimer_delete() check "timer->it_process != NULL", this looks completely bogus. ->it_process == NULL means that this timer is already under destruction or it is not fully initialized, this must not happen. sys_timer_delete: the timer is locked, and lock_timer() can't succeed if ->it_process == NULL. itimer_delete: it is called by exit_itimers() when there are no other threads which can play with signal_struct->posix_timers. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: john stultz Cc: Thomas Gleixner Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index dbd8398..17f5326 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -856,11 +856,10 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_process) { - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); - timer->it_process = NULL; - } + if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) + put_task_struct(timer->it_process); + timer->it_process = NULL; + unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); return 0; @@ -885,11 +884,10 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_process) { - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); - timer->it_process = NULL; - } + if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) + put_task_struct(timer->it_process); + timer->it_process = NULL; + unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); } -- cgit v1.1 From 96347e7759e2e433c427defa0fa1adfc8cce6226 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:27 -0700 Subject: posix timers: release_posix_timer: kill the bogus put_task_struct(->it_process); release_posix_timer() can't be called with ->it_process != NULL. 
Once sys_timer_create() sets ->it_process it must not call release_posix_timer(), otherwise we can race with another thread doing sys_timer_delete(), this timer is visible to idr_find() and unlocked. The same is true for two other callers (actually, for any possible caller), sys_timer_delete() and itimer_delete(). They must clear ->it_process before unlock_timer() + release_posix_timer(). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: john stultz Cc: Thomas Gleixner Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 17f5326..9a21681 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -449,9 +449,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) spin_unlock_irqrestore(&idr_lock, flags); } sigqueue_free(tmr->sigq); - if (unlikely(tmr->it_process) && - tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(tmr->it_process); kmem_cache_free(posix_timers_cache, tmr); } -- cgit v1.1 From 6715ca451cfff1c9ce4b33ad9918a1dacf43997c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:27 -0700 Subject: signals: collect_signal: remove the unneeded sigismember() check collect_signal() checks sigismember(&list->signal, sig), this is not needed. This "sig" was just found by next_signal(), so it must be valid. We have a (completely broken) call to ->notifier in between, but it must not play with sigpending->signal bits or unlock ->siglock. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6c0958e..c5b9aab 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -343,9 +343,6 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) struct sigqueue *q, *first = NULL; int still_pending = 0; - if (unlikely(!sigismember(&list->signal, sig))) - return 0; - /* * Collect the siginfo appropriate to this signal. Check if * there is another siginfo for the same signal. -- cgit v1.1 From d4434207616980885205c605697868c0f07e4378 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:28 -0700 Subject: signals: collect_signal: simplify the "still_pending" logic Factor out sigdelset() calls and remove the "still_pending" variable. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c5b9aab..50ad439 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -341,7 +341,6 @@ unblock_all_signals(void) static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; - int still_pending = 0; /* * Collect the siginfo appropriate to this signal. 
Check if @@ -349,26 +348,24 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) */ list_for_each_entry(q, &list->list, list) { if (q->info.si_signo == sig) { - if (first) { - still_pending = 1; - break; - } + if (first) + goto still_pending; first = q; } } + + sigdelset(&list->signal, sig); + if (first) { +still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); __sigqueue_free(first); - if (!still_pending) - sigdelset(&list->signal, sig); } else { - /* Ok, it wasn't in the queue. This must be a fast-pathed signal or we must have been out of queue space. So zero out the info. */ - sigdelset(&list->signal, sig); info->si_signo = sig; info->si_errno = 0; info->si_code = 0; -- cgit v1.1 From 100360f03077663b7bef3af44805b6cf700c3bee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:29 -0700 Subject: signals: change collect_signal() to return void With the recent changes collect_signal() always returns true. Change it to return void and update the single caller. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 50ad439..fea236f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -338,7 +338,7 @@ unblock_all_signals(void) spin_unlock_irqrestore(¤t->sighand->siglock, flags); } -static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) +static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; @@ -372,7 +372,6 @@ still_pending: info->si_pid = 0; info->si_uid = 0; } - return 1; } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, @@ -390,8 +389,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } } - if (!collect_signal(sig, pending, info)) - sig = 0; + collect_signal(sig, pending, info); } return sig; -- cgit v1.1 From 3854a771821c970065e3203a0b40ddc4101538cc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:29 -0700 Subject: __exit_signal: don't take rcu lock There is no reason for rcu_read_lock() in __exit_signal(). tsk->sighand can only be changed if tsk does exec, obviously this is not possible. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 93d2711..a7799d8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk) BUG_ON(!sig); BUG_ON(!atomic_read(&sig->count)); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); @@ -136,7 +135,6 @@ static void __exit_signal(struct task_struct *tsk) tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); - rcu_read_unlock(); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); -- cgit v1.1 From 92413d771e7123304fb4b9efd2a00cccc946e383 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:30 -0700 Subject: signals: dequeue_signal: don't check SIGNAL_GROUP_EXIT when setting SIGNAL_STOP_DEQUEUED dequeue_signal() checks SIGNAL_GROUP_EXIT before setting SIGNAL_STOP_DEQUEUED. This was added by 788e05a67c343fa22f2ae1d3ca264e7f15c25eaf a long ago to avoid the coredump/SIGSTOP race. 
Since then the related code was changed, and now this subtle check is both incomplete and unneeded at the same time. It is incomplete because nowadays exec() doesn't set SIGNAL_GROUP_EXIT, so in fact we should check signal_group_exit() to avoid a similar race. Fortunately, we doesn't need the check at all. The only function which relies on SIGNAL_STOP_DEQUEUED is do_signal_stop(), and it ignores this flag if signal_group_exit() == T, this covers the SIGNAL_GROUP_EXIT case. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index fea236f..15f901a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -454,8 +454,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* -- cgit v1.1 From 2b201a9eddf509e8e935b45e573648e36f4b623f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:31 -0700 Subject: signals: do_signal_stop: kill the SIGNAL_UNKILLABLE check fae5fa44f1fd079ffbed8e0add929dd7bbd1347f changed do_signal_stop() to check SIGNAL_UNKILLABLE, this wasn't needed. If signal_group_exit() == F, the signal sent to SIGNAL_UNKILLABLE task must be already filtered out by the caller, get_signal_to_deliver(). And if signal_group_exit() == T we are not going to stop. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 15f901a..0514da5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1649,8 +1649,7 @@ static int do_signal_stop(int signr) } else { struct task_struct *t; - if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE)) - != SIGNAL_STOP_DEQUEUED) || + if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) return 0; /* -- cgit v1.1 From d8878ba3f05ae5bbfad5a6e72e5121c0ea35f989 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 25 Jul 2008 01:47:32 -0700 Subject: signals: make siginfo_t si_utime + si_sstime report times in USER_HZ, not HZ In the switch to configurable HZ in 2.6, the treatment of the si_utime and si_stime fields that are exposed to userland via the siginfo structure looks to have been botched. As things stand, these fields report times in units of HZ, so that userland gets information that varies depending on the HZ that the kernel was configured with. This patch changes the reported values to use USER_HZ units. 
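As a purely illustrative userspace consequence (not part of the patch): after this change a program can divide the si_utime/si_stime values delivered with SIGCHLD by sysconf(_SC_CLK_TCK) and get seconds, regardless of which CONFIG_HZ the kernel was built with. A minimal sketch:

	#include <signal.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	static void chld_handler(int sig, siginfo_t *info, void *ctx)
	{
		long tick = sysconf(_SC_CLK_TCK);

		/* printf() in a handler is not async-signal-safe; demo only */
		printf("child %d: %.2fs user, %.2fs system\n", (int)info->si_pid,
		       (double)info->si_utime / tick,
		       (double)info->si_stime / tick);
	}

	int main(void)
	{
		struct sigaction sa;
		volatile long i;

		sa.sa_sigaction = chld_handler;
		sa.sa_flags = SA_SIGINFO;
		sigemptyset(&sa.sa_mask);
		sigaction(SIGCHLD, &sa, NULL);

		if (fork() == 0) {
			for (i = 0; i < 100000000L; i++)
				;		/* burn some user CPU time */
			exit(0);
		}
		pause();	/* SIGCHLD carries the child's accumulated times */
		return 0;
	}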
Signed-off-by: Michael Kerrisk Acked-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0514da5..ba60eee 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1370,10 +1370,9 @@ void do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = tsk->uid; - /* FIXME: find out whether or not this is supposed to be c*time. */ - info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime, + info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, tsk->signal->utime)); - info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime, + info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; @@ -1441,9 +1440,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_uid = tsk->uid; - /* FIXME: find out whether or not this is supposed to be c*time. */ - info.si_utime = cputime_to_jiffies(tsk->utime); - info.si_stime = cputime_to_jiffies(tsk->stime); + info.si_utime = cputime_to_clock_t(tsk->utime); + info.si_stime = cputime_to_clock_t(tsk->stime); info.si_code = why; switch (why) { -- cgit v1.1 From bc64efd220dcd4449aef8dd2564d73127b583b09 Mon Sep 17 00:00:00 2001 From: Gustavo Fernando Padovan Date: Fri, 25 Jul 2008 01:47:33 -0700 Subject: kernel/signal.c: change vars pid and tgid types to pid_t Change the type of pid and tgid variables from int to the POSIX type pid_t. Signed-off-by: Gustavo F. Padovan Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ba60eee..fdab7b3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1116,7 +1116,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); * is probably wrong. Should make it like BSD or SYSV. */ -static int kill_something_info(int sig, struct siginfo *info, int pid) +static int kill_something_info(int sig, struct siginfo *info, pid_t pid) { int ret; @@ -2184,7 +2184,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, } asmlinkage long -sys_kill(int pid, int sig) +sys_kill(pid_t pid, int sig) { struct siginfo info; @@ -2197,7 +2197,7 @@ sys_kill(int pid, int sig) return kill_something_info(sig, &info, pid); } -static int do_tkill(int tgid, int pid, int sig) +static int do_tkill(pid_t tgid, pid_t pid, int sig) { int error; struct siginfo info; @@ -2243,7 +2243,7 @@ static int do_tkill(int tgid, int pid, int sig) * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ -asmlinkage long sys_tgkill(int tgid, int pid, int sig) +asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) @@ -2256,7 +2256,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) * Send a signal to only one task, even if it's a CLONE_THREAD task. 
*/ asmlinkage long -sys_tkill(int pid, int sig) +sys_tkill(pid_t pid, int sig) { /* This is only valid for single tasks */ if (pid <= 0) @@ -2266,7 +2266,7 @@ sys_tkill(int pid, int sig) } asmlinkage long -sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) +sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo) { siginfo_t info; -- cgit v1.1 From 3d749b9e676b26584a47e75c235aa6f69d0697ae Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:37 -0700 Subject: ptrace: simplify ptrace_stop()->sigkill_pending() path 1. SIGKILL can't be blocked, remove this check from sigkill_pending(). 2. When ptrace_stop() sees sigkill_pending() == T, it can just return. Kill "int killed" and simplify the code. This also is more correct, the tracer shouldn't see us in TASK_TRACED if we are not going to stop. I strongly believe this code needs further changes. We should do the "was this task killed" check unconditionally, currently it depends on arch_ptrace_stop_needed(). On the other hand, sigkill_pending() isn't very clever. If the task was killed tkill(SIGKILL), the signal can be already dequeued if the caller is do_exit(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index fdab7b3..39c1706 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1496,9 +1496,8 @@ static inline int may_ptrace_stop(void) */ static int sigkill_pending(struct task_struct *tsk) { - return ((sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) && - !unlikely(sigismember(&tsk->blocked, SIGKILL))); + return sigismember(&tsk->pending.signal, SIGKILL) || + sigismember(&tsk->signal->shared_pending.signal, SIGKILL); } /* @@ -1514,8 +1513,6 @@ static int sigkill_pending(struct task_struct *tsk) */ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) { - int killed = 0; - if (arch_ptrace_stop_needed(exit_code, info)) { /* * The arch code has something special to do before a @@ -1531,7 +1528,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) spin_unlock_irq(¤t->sighand->siglock); arch_ptrace_stop(exit_code, info); spin_lock_irq(¤t->sighand->siglock); - killed = sigkill_pending(current); + if (sigkill_pending(current)) + return; } /* @@ -1548,7 +1546,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) __set_current_state(TASK_TRACED); spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); - if (!unlikely(killed) && may_ptrace_stop()) { + if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); -- cgit v1.1 From 7b34e4283c685f5cc6ba6d30e939906eee0d4bcf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:37 -0700 Subject: introduce PF_KTHREAD flag Introduce the new PF_KTHREAD flag to mark the kernel threads. It is set by INIT_TASK() and copied to the forked childs (we could set it in kthreadd() along with PF_NOFREEZE instead). daemonize() was changed as well. In that case testing of PF_KTHREAD is racy, but daemonize() is hopeless anyway. This flag is cleared in do_execve(), before search_binary_handler(). Probably not the best place, we can do this in exec_mmap() or in start_thread(), or clear it along with PF_FORKNOEXEC. 
But I think this doesn't matter in practice, and if do_execve() fails kthread should die soon. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a7799d8..28a44a2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -430,7 +430,7 @@ void daemonize(const char *name, ...) * We don't want to have TIF_FREEZE set if the system-wide hibernation * or suspend transition begins right now. */ - current->flags |= PF_NOFREEZE; + current->flags |= (PF_NOFREEZE | PF_KTHREAD); if (current->nsproxy != &init_nsproxy) { get_nsproxy(&init_nsproxy); -- cgit v1.1 From 246bb0b1deb29726990620d8b5e55ca29f331362 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:38 -0700 Subject: kill PF_BORROWED_MM in favour of PF_KTHREAD Kill PF_BORROWED_MM. Change use_mm/unuse_mm to not play with ->flags, and do s/PF_BORROWED_MM/PF_KTHREAD/ for a couple of other users. No functional changes yet. But this allows us to do further fixes/cleanups. oom_kill/ptrace/etc often check "p->mm != NULL" to filter out the kthreads, this is wrong because of use_mm(). The problem with PF_BORROWED_MM is that we need task_lock() to avoid races. With this patch we can check PF_KTHREAD directly, or use a simple lockless helper: /* The result must not be dereferenced !!! */ struct mm_struct *__get_task_mm(struct task_struct *tsk) { if (tsk->flags & PF_KTHREAD) return NULL; return tsk->mm; } Note also ecard_task(). It runs with ->mm != NULL, but it's the kernel thread without PF_BORROWED_MM. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 228f80c..eeaec68 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(mmput); /** * get_task_mm - acquire a reference to the task's mm * - * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning + * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() @@ -487,7 +487,7 @@ struct mm_struct *get_task_mm(struct task_struct *task) task_lock(task); mm = task->mm; if (mm) { - if (task->flags & PF_BORROWED_MM) + if (task->flags & PF_KTHREAD) mm = NULL; else atomic_inc(&mm->mm_users); -- cgit v1.1 From 32ecb1f26dd50eeaac4e3f4dea4541c97848e459 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: turn mm->core_startup_done into the pointer to struct core_state mm->core_startup_done points to "struct completion startup_done" allocated on the coredump_wait()'s stack. Introduce the new structure, core_state, which holds this "struct completion". This way we can add more info visible to the threads participating in coredump without enlarging mm_struct. No changes in affected .o files. 
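For orientation, a sketch rather than a quote of the patch (the mm_struct and header changes live outside this kernel/-only log): at this point the new structure is essentially just a wrapper around the completion, and mm->core_startup_done becomes a struct core_state *core_state pointer; the nr_threads and dumper members used by the follow-up patches below are added there, not here.

	struct core_state {
		struct completion startup;
	};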
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 28a44a2..f7fa21d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -680,7 +680,7 @@ static void exit_mm(struct task_struct * tsk) up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); if (!--mm->core_waiters) - complete(mm->core_startup_done); + complete(&mm->core_state->startup); up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); -- cgit v1.1 From 999d9fc1670bc082928b93b11d1f2e0e417d973c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: move mm->core_waiters into struct core_state Move mm->core_waiters into "struct core_state" allocated on stack. This shrinks mm_struct a little bit and allows further changes. This patch mostly does s/core_waiters/core_state. The only essential change is that coredump_wait() must clear mm->core_state before return. The coredump_wait()'s path is uglified and .text grows by 30 bytes, this is fixed by the next patch. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++---- kernel/fork.c | 2 +- kernel/signal.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f7fa21d..988e232 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -670,16 +670,16 @@ static void exit_mm(struct task_struct * tsk) return; /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_waiters + * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread - * will increment core_waiters for each thread in the + * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_waiters) { + if (mm->core_state) { up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); - if (!--mm->core_waiters) + if (!--mm->core_state->nr_threads) complete(&mm->core_state->startup); up_write(&mm->mmap_sem); diff --git a/kernel/fork.c b/kernel/fork.c index eeaec68..813d5c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -400,7 +400,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : MMF_DUMP_FILTER_DEFAULT; - mm->core_waiters = 0; + mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); diff --git a/kernel/signal.c b/kernel/signal.c index 39c1706..5c7b7ea 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1480,10 +1480,10 @@ static inline int may_ptrace_stop(void) * is a deadlock situation, and pointless because our tracer * is dead so don't allow us to stop. * If SIGKILL was already sent before the caller unlocked - * ->siglock we must see ->core_waiters != 0. Otherwise it + * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). 
*/ - if (unlikely(current->mm->core_waiters) && + if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) return 0; -- cgit v1.1 From c5f1cc8c1828486a61ab3e575da6e2c62b34d399 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:42 -0700 Subject: coredump: turn core_state->nr_threads into atomic_t Turn core_state->nr_threads into atomic_t and kill now unneeded down_write(&mm->mmap_sem) in exit_mm(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 988e232..63d8295 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -678,10 +678,9 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); if (mm->core_state) { up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - if (!--mm->core_state->nr_threads) + + if (atomic_dec_and_test(&mm->core_state->nr_threads)) complete(&mm->core_state->startup); - up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit v1.1 From b564daf806d492dd4f7afe9b6c83b8d35d137669 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:44 -0700 Subject: coredump: construct the list of coredumping threads at startup time binfmt->core_dump() has to iterate over the all threads in system in order to find the coredumping threads and construct the list using the GFP_ATOMIC allocations. With this patch each thread allocates the list node on exit_mm()'s stack and adds itself to the list. This allows us to do further changes: - simplify ->core_dump() - change exit_mm() to clear ->mm first, then wait for ->core_done. this makes the coredumping process visible to oom_kill - kill mm->core_done Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 63d8295..b66f0d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -664,6 +664,7 @@ assign_new_owner: static void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; + struct core_state *core_state; mm_release(tsk, mm); if (!mm) @@ -676,11 +677,19 @@ static void exit_mm(struct task_struct * tsk) * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_state) { + core_state = mm->core_state; + if (core_state) { + struct core_thread self; up_read(&mm->mmap_sem); - if (atomic_dec_and_test(&mm->core_state->nr_threads)) - complete(&mm->core_state->startup); + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit v1.1 From a94e2d408eaedbd85aae259621d46fafc10479a2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:46 -0700 Subject: coredump: kill mm->core_done Now that we have core_state->dumper list we can use it to wake up the sub-threads waiting for the coredump completion. This uglifies the code and .text grows by 47 bytes, but otoh mm_struct lessens by sizeof(struct completion). Also, with this change we can decouple exit_mm() from the coredumping code. 
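The exit_mm() side of this handshake is shown in the diff below; the dumper's side, coredump_finish() in fs/exec.c, is outside this kernel/-only log. Roughly, and ignoring the exact barrier details, it has to do something like the following sketch: walk the dumper list, clear each waiter's ->task, wake it, and finally clear mm->core_state.

	static void coredump_finish(struct mm_struct *mm)
	{
		struct core_thread *curr, *next;
		struct task_struct *task;

		next = mm->core_state->dumper.next;
		while ((curr = next) != NULL) {
			next = curr->next;
			task = curr->task;
			/*
			 * exit_mm() re-checks self.task after schedule(), so
			 * clear it before the wakeup; wake_up_process()
			 * provides the ordering the waiter relies on.
			 */
			curr->task = NULL;
			wake_up_process(task);
		}
		mm->core_state = NULL;
	}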
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b66f0d5..8a4d4d1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -691,7 +691,13 @@ static void exit_mm(struct task_struct * tsk) if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); - wait_for_completion(&mm->core_done); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); -- cgit v1.1 From 1a4d9b0aa0d3c50314e57525a5e5ec2cfc48b4c8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:47 -0700 Subject: workqueues: insert_work: use "list_head *" instead of "int tail" insert_work() inserts the new work_struct before or after cwq->worklist, depending on the "int tail" parameter. Change it to accept "list_head *" instead, this shrinks .text a bit and allows us to insert the barrier after specific work_struct. Signed-off-by: Oleg Nesterov Cc: Jarek Poplawski Cc: Max Krasnyansky Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6fd158b..d9a2d65 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,7 +125,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) } static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, int tail) + struct work_struct *work, struct list_head *head) { set_wq_data(work, cwq); /* @@ -133,10 +133,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, * result of list_add() below, see try_to_grab_pending(). 
*/ smp_wmb(); - if (tail) - list_add_tail(&work->entry, &cwq->worklist); - else - list_add(&work->entry, &cwq->worklist); + list_add_tail(&work->entry, head); wake_up(&cwq->more_work); } @@ -146,7 +143,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, unsigned long flags; spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, 1); + insert_work(cwq, work, &cwq->worklist); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -361,14 +358,14 @@ static void wq_barrier_func(struct work_struct *work) } static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, int tail) + struct wq_barrier *barr, struct list_head *head) { INIT_WORK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); init_completion(&barr->done); - insert_work(cwq, &barr->work, tail); + insert_work(cwq, &barr->work, head); } static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) @@ -388,7 +385,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) active = 0; spin_lock_irq(&cwq->lock); if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, 1); + insert_wq_barrier(cwq, &barr, &cwq->worklist); active = 1; } spin_unlock_irq(&cwq->lock); @@ -473,7 +470,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, spin_lock_irq(&cwq->lock); if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, 0); + insert_wq_barrier(cwq, &barr, cwq->worklist.next); running = 1; } spin_unlock_irq(&cwq->lock); -- cgit v1.1 From db700897224b5ebdf852f2d38920ce428940d059 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:49 -0700 Subject: workqueues: implement flush_work() Most of users of flush_workqueue() can be changed to use cancel_work_sync(), but sometimes we really need to wait for the completion and cancelling is not an option. schedule_on_each_cpu() is good example. Add the new helper, flush_work(work), which waits for the completion of the specific work_struct. More precisely, it "flushes" the result of of the last queue_work() which is visible to the caller. For example, this code queue_work(wq, work); /* WINDOW */ queue_work(wq, work); flush_work(work); doesn't necessary work "as expected". What can happen in the WINDOW above is - wq starts the execution of work->func() - the caller migrates to another CPU now, after the 2nd queue_work() this work is active on the previous CPU, and at the same time it is queued on another. In this case flush_work(work) may return before the first work->func() completes. It is trivial to add another helper int flush_work_sync(struct work_struct *work) { return flush_work(work) || wait_on_work(work); } which works "more correctly", but it has to iterate over all CPUs and thus it much slower than flush_work(). 
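As a usage sketch (names assumed; roughly the end state after the follow-up schedule_on_each_cpu() conversions later in this series): each work item is bound to a known CPU, so the caller cannot race with its own migration, and flush_work() then waits for exactly that item:

	int cpu;
	struct work_struct *works = alloc_percpu(struct work_struct);

	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		INIT_WORK(work, func);
		schedule_work_on(cpu, work);
	}
	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(works, cpu));	/* waits for this item only */

Unlike flush_workqueue(), nothing queued by other users of the same workqueue is waited for.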
Signed-off-by: Oleg Nesterov Acked-by: Max Krasnyansky Acked-by: Jarek Poplawski Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d9a2d65..ee41cf8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -423,6 +423,52 @@ void flush_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(flush_workqueue); +/** + * flush_work - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * It is expected that, prior to calling flush_work(), the caller has + * arranged for the work to not be requeued, otherwise it doesn't make + * sense to use this function. + */ +int flush_work(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + struct list_head *prev; + struct wq_barrier barr; + + might_sleep(); + cwq = get_wq_data(work); + if (!cwq) + return 0; + + prev = NULL; + spin_lock_irq(&cwq->lock); + if (!list_empty(&work->entry)) { + /* + * See the comment near try_to_grab_pending()->smp_rmb(). + * If it was re-queued under us we are not going to wait. + */ + smp_rmb(); + if (unlikely(cwq != get_wq_data(work))) + goto out; + prev = &work->entry; + } else { + if (cwq->current_work != work) + goto out; + prev = &cwq->worklist; + } + insert_wq_barrier(cwq, &barr, prev->next); +out: + spin_unlock_irq(&cwq->lock); + if (!prev) + return 0; + + wait_for_completion(&barr.done); + return 1; +} +EXPORT_SYMBOL_GPL(flush_work); + /* * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, * so this work can't be re-armed in any way. -- cgit v1.1 From 8616a89ab761239c963eea3a63be383f127cc7e8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:49 -0700 Subject: workqueues: schedule_on_each_cpu: use flush_work() Change schedule_on_each_cpu() to use flush_work() instead of flush_workqueue(), this way we don't wait for other work_struct's which can be queued meanwhile. Signed-off-by: Oleg Nesterov Cc: Jarek Poplawski Cc: Max Krasnyansky Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee41cf8..5fbffd3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -690,7 +690,8 @@ int schedule_on_each_cpu(work_func_t func) set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); } - flush_workqueue(keventd_wq); + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); put_online_cpus(); free_percpu(works); return 0; -- cgit v1.1 From 3da1c84c00c7e5fa8348336bd8c342f9128b0f14 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:50 -0700 Subject: workqueues: make get_online_cpus() useable for work->func() workqueue_cpu_callback(CPU_DEAD) flushes cwq->thread under cpu_maps_update_begin(). This means that the multithreaded workqueues can't use get_online_cpus() due to the possible deadlock, very bad and very old problem. Introduce the new state, CPU_POST_DEAD, which is called after cpu_hotplug_done() but before cpu_maps_update_done(). Change workqueue_cpu_callback() to use CPU_POST_DEAD instead of CPU_DEAD. This means that create/destroy functions can't rely on get_online_cpus() any longer and should take cpu_add_remove_lock instead. 
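In outline, the hot-unplug ordering after this change is roughly (sketch of the kernel/cpu.c call sites):

	cpu_down()
		cpu_maps_update_begin();	/* takes cpu_add_remove_lock */
		_cpu_down()
			cpu_hotplug_begin();
			CPU_DOWN_PREPARE, __cpu_die(), CPU_DEAD notifiers
			cpu_hotplug_done();	/* get_online_cpus() works again */
			CPU_POST_DEAD notifiers	/* workqueues may flush cwq->thread here */
		cpu_maps_update_done();

so a CPU_POST_DEAD handler may sleep in flush_cpu_workqueue()/kthread_stop() without the old get_online_cpus() deadlock, while __create_workqueue_key() and destroy_workqueue() serialize against it through cpu_maps_update_begin()/done().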
[akpm@linux-foundation.org: fix CONFIG_SMP=n] Signed-off-by: Oleg Nesterov Acked-by: Gautham R Shenoy Cc: Heiko Carstens Cc: Max Krasnyansky Cc: Paul Jackson Cc: Paul Menage Cc: Peter Zijlstra Cc: Vegard Nossum Cc: Martin Schwidefsky Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 5 +++++ kernel/workqueue.c | 18 +++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2cc409c..10ba5f1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -285,6 +285,11 @@ out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: cpu_hotplug_done(); + if (!err) { + if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, + hcpu) == NOTIFY_BAD) + BUG(); + } return err; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5fbffd3..828e582 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -828,7 +828,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, singlethread_cpu); start_workqueue_thread(cwq, -1); } else { - get_online_cpus(); + cpu_maps_update_begin(); spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); spin_unlock(&workqueue_lock); @@ -840,7 +840,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, cpu); start_workqueue_thread(cwq, cpu); } - put_online_cpus(); + cpu_maps_update_done(); } if (err) { @@ -854,8 +854,8 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key); static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* - * Our caller is either destroy_workqueue() or CPU_DEAD, - * get_online_cpus() protects cwq->thread. + * Our caller is either destroy_workqueue() or CPU_POST_DEAD, + * cpu_add_remove_lock protects cwq->thread. */ if (cwq->thread == NULL) return; @@ -865,7 +865,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) flush_cpu_workqueue(cwq); /* - * If the caller is CPU_DEAD and cwq->worklist was not empty, + * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, * a concurrent flush_workqueue() can insert a barrier after us. * However, in that case run_workqueue() won't return and check * kthread_should_stop() until it flushes all work_struct's. @@ -889,14 +889,14 @@ void destroy_workqueue(struct workqueue_struct *wq) const cpumask_t *cpu_map = wq_cpu_map(wq); int cpu; - get_online_cpus(); + cpu_maps_update_begin(); spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); for_each_cpu_mask_nr(cpu, *cpu_map) cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - put_online_cpus(); + cpu_maps_update_done(); free_percpu(wq->cpu_wq); kfree(wq); @@ -935,7 +935,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: start_workqueue_thread(cwq, -1); - case CPU_DEAD: + case CPU_POST_DEAD: cleanup_workqueue_thread(cwq); break; } @@ -943,7 +943,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_CANCELED: - case CPU_DEAD: + case CPU_POST_DEAD: cpu_clear(cpu, cpu_populated_map); } -- cgit v1.1 From a67da70dc0955580665f5444f318b92e69a3c272 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:52 -0700 Subject: workqueues: lockdep annotations for flush_work() Add lockdep annotations to flush_work() and update the comment. 
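The dependency recorded is the usual flush-vs-callback one. As an illustrative example (my_mutex and my_work are made-up names), lockdep can now warn about a caller that flushes while holding a lock the callback also takes, instead of that inversion only surfacing as a runtime deadlock:

	static void my_work_fn(struct work_struct *work)
	{
		mutex_lock(&my_mutex);	/* records wq -> my_mutex */
		/* ... */
		mutex_unlock(&my_mutex);
	}

	mutex_lock(&my_mutex);
	flush_work(&my_work);		/* records my_mutex -> wq: inversion reported */
	mutex_unlock(&my_mutex);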
Signed-off-by: Oleg Nesterov Cc: Jarek Poplawski Acked-by: Johannes Berg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 828e582..4fcb75b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -427,6 +427,8 @@ EXPORT_SYMBOL_GPL(flush_workqueue); * flush_work - block until a work_struct's callback has terminated * @work: the work which is to be flushed * + * Returns false if @work has already terminated. + * * It is expected that, prior to calling flush_work(), the caller has * arranged for the work to not be requeued, otherwise it doesn't make * sense to use this function. @@ -442,6 +444,9 @@ int flush_work(struct work_struct *work) if (!cwq) return 0; + lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + prev = NULL; spin_lock_irq(&cwq->lock); if (!list_empty(&work->entry)) { -- cgit v1.1 From ef1ca236b8d645349ed6569598ae3f6c1b9511c0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:53 -0700 Subject: workqueues: queue_work() can use queue_work_on() queue_work() can use queue_work_on() to avoid the code duplication. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4fcb75b..fe08a85 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -159,14 +159,11 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, */ int queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0; + int ret; + + ret = queue_work_on(get_cpu(), wq, work); + put_cpu(); - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, get_cpu()), work); - put_cpu(); - ret = 1; - } return ret; } EXPORT_SYMBOL_GPL(queue_work); -- cgit v1.1 From 8de6d308bab4f67fcf953562f9f08f9527cad72d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:53 -0700 Subject: workqueues: schedule_on_each_cpu() can use schedule_work_on() schedule_on_each_cpu() can use schedule_work_on() to avoid the code duplication. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fe08a85..7cf4303 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -689,8 +689,7 @@ int schedule_on_each_cpu(work_func_t func) struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); - __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); + schedule_work_on(cpu, work); } for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); -- cgit v1.1 From 8448502cfc915f70e3f8923849ade27d472044cb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:54 -0700 Subject: workqueues: do CPU_UP_CANCELED if CPU_UP_PREPARE fails The bug was pointed out by Akinobu Mita , and this patch is based on his original patch. workqueue_cpu_callback(CPU_UP_PREPARE) expects that if it returns NOTIFY_BAD, _cpu_up() will send CPU_UP_CANCELED then. 
However, this is not true since "cpu hotplug: cpu: deliver CPU_UP_CANCELED only to NOTIFY_OKed callbacks with CPU_UP_PREPARE" commit: a0d8cdb652d35af9319a9e0fb7134de2a276c636 The callback which has returned NOTIFY_BAD will not receive CPU_UP_CANCELED. Change the code to fulfil the CPU_UP_CANCELED logic if CPU_UP_PREPARE fails. Signed-off-by: Oleg Nesterov Reported-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7cf4303..ec7e4f6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -911,6 +911,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned int cpu = (unsigned long)hcpu; struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; + int ret = NOTIFY_OK; action &= ~CPU_TASKS_FROZEN; @@ -918,7 +919,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: cpu_set(cpu, cpu_populated_map); } - +undo: list_for_each_entry(wq, &workqueues, list) { cwq = per_cpu_ptr(wq->cpu_wq, cpu); @@ -928,7 +929,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, break; printk(KERN_ERR "workqueue [%s] for %i failed\n", wq->name, cpu); - return NOTIFY_BAD; + action = CPU_UP_CANCELED; + ret = NOTIFY_BAD; + goto undo; case CPU_ONLINE: start_workqueue_thread(cwq, cpu); @@ -948,7 +951,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, cpu_clear(cpu, cpu_populated_map); } - return NOTIFY_OK; + return ret; } void __init init_workqueues(void) -- cgit v1.1 From 339caf2a224fc9af0f01686bf287dda32c6efca6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 25 Jul 2008 01:48:31 -0700 Subject: proc: misplaced export of find_get_pid Move EXPORT_SYMBOL right after the func Signed-off-by: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 30bd5d4..753fd90 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -435,6 +435,7 @@ struct pid *find_get_pid(pid_t nr) return pid; } +EXPORT_SYMBOL_GPL(find_get_pid); pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) { @@ -497,7 +498,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return pid; } -EXPORT_SYMBOL_GPL(find_get_pid); /* * The pid hash table is scaled according to the amount of memory in the -- cgit v1.1 From 99541c23cd32bacf1a591ca537a7c0cb9053ad7e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 01:48:31 -0700 Subject: sysctl: check for bogus modes Catch, e. g., 644/0644 typo. Signed-off-by: Alexey Dobriyan Acked-by: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_check.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index c09350d..c35da23a 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) sysctl_check_leaf(namespaces, table, &fail); } sysctl_check_bin_path(table, &fail); + if (table->mode > 0777) + set_fail(&fail, table, "bogus .mode"); if (fail) { set_fail(&fail, table, NULL); error = -EINVAL; -- cgit v1.1 From 19b0cfcca41dd772065671ad0584e1cea0f3fd13 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:35 -0700 Subject: pidns: remove now unused kill_proc function This function operated on a pid_t to kill a task, which is no longer valid in a containerized system. It has finally lost all its users and we can safely remove it from the tree. Signed-off-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5c7b7ea..82c3545 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1228,17 +1228,6 @@ int kill_pid(struct pid *pid, int sig, int priv) } EXPORT_SYMBOL(kill_pid); -int -kill_proc(pid_t pid, int sig, int priv) -{ - int ret; - - rcu_read_lock(); - ret = kill_pid_info(sig, __si_special(priv), find_pid(pid)); - rcu_read_unlock(); - return ret; -} - /* * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot @@ -1906,7 +1895,6 @@ EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(kill_proc); EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); -- cgit v1.1 From e49859e71e0318b564de1546bdc30fab738f9deb Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:36 -0700 Subject: pidns: remove now unused find_pid function. This one had the only users so far - the kill_proc, which is removed, so drop this (invalid in namespaced world) call too. And of course - erase all references on it from comments. Signed-off-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 753fd90..064e76a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -309,12 +309,6 @@ struct pid *find_vpid(int nr) } EXPORT_SYMBOL_GPL(find_vpid); -struct pid *find_pid(int nr) -{ - return find_pid_ns(nr, &init_pid_ns); -} -EXPORT_SYMBOL_GPL(find_pid); - /* * attach_pid() must be called with the tasklist_lock write-held. */ @@ -483,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns); /* * Used by proc to find the first pid that is greater then or equal to nr. * - * If there is a pid at nr this function is exactly the same as find_pid. + * If there is a pid at nr this function is exactly the same as find_pid_ns. 
*/ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { -- cgit v1.1 From 28325df0d9339b7f3aba9c45174d4586223ef46b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 25 Jul 2008 01:48:38 -0700 Subject: markers: use rcu_barrier_sched() and call_rcu_sched() rcu_barrier_sched() and call_rcu_sched() were introduced in 2.6.26 for the Markers. Change the marker code to use them. It can be seen as a fix since the marker code was using an ugly, temporary, #ifdef hack to work around CONFIG_PREEMPT_RCU. Signed-off-by: Mathieu Desnoyers Acked-by: Paul McKenney Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 1abfb92..971da53 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -441,7 +441,7 @@ static int remove_marker(const char *name) hlist_del(&e->hlist); /* Make sure the call_rcu has been executed */ if (e->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); kfree(e); return 0; } @@ -476,7 +476,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format) hlist_del(&(*entry)->hlist); /* Make sure the call_rcu has been executed */ if ((*entry)->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); kfree(*entry); *entry = e; trace_mark(core_marker_format, "name %s format %s", @@ -655,7 +655,7 @@ int marker_probe_register(const char *name, const char *format, * make sure it's executed now. */ if (entry->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); old = marker_entry_add_probe(entry, probe, probe_private); if (IS_ERR(old)) { ret = PTR_ERR(old); @@ -670,10 +670,7 @@ int marker_probe_register(const char *name, const char *format, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); -#ifdef CONFIG_PREEMPT_RCU - synchronize_sched(); /* Until we have the call_rcu_sched() */ -#endif - call_rcu(&entry->rcu, free_old_closure); + call_rcu_sched(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); return ret; @@ -704,7 +701,7 @@ int marker_probe_unregister(const char *name, if (!entry) goto end; if (entry->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ @@ -716,10 +713,7 @@ int marker_probe_unregister(const char *name, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); -#ifdef CONFIG_PREEMPT_RCU - synchronize_sched(); /* Until we have the call_rcu_sched() */ -#endif - call_rcu(&entry->rcu, free_old_closure); + call_rcu_sched(&entry->rcu, free_old_closure); remove_marker(name); /* Ignore busy error message */ ret = 0; end: @@ -786,7 +780,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, goto end; } if (entry->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ @@ -797,10 +791,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); -#ifdef CONFIG_PREEMPT_RCU - synchronize_sched(); /* Until we have the call_rcu_sched() */ -#endif - call_rcu(&entry->rcu, free_old_closure); + call_rcu_sched(&entry->rcu, free_old_closure); remove_marker(entry->name); /* Ignore busy error message 
*/ end: mutex_unlock(&markers_mutex); -- cgit v1.1 From a89cc1959d0ea5f36bf7421dc97b34f03809637d Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 25 Jul 2008 01:48:39 -0700 Subject: markers: fix sparse integer as NULL pointer warning kernel/trace/trace_sysprof.c:164:20: warning: Using plain integer as NULL pointer Signed-off-by: Harvey Harrison Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_sysprof.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 6352808..ce2d723 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -161,7 +161,7 @@ static void timer_notify(struct pt_regs *regs, int cpu) __trace_special(tr, data, 2, regs->ip, 0); while (i < sample_max_depth) { - frame.next_fp = 0; + frame.next_fp = NULL; frame.return_address = 0; if (!copy_stack_frame(fp, &frame)) break; -- cgit v1.1 From 7394f0f6c0baab650ea9194cb1be847df646fb57 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 01:48:40 -0700 Subject: unexport uts_sem With the removal of the Solaris binary emulation the export of uts_sem became unused. Signed-off-by: Adrian Bunk Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 6c21880..0c9d3fa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1343,8 +1343,6 @@ EXPORT_SYMBOL(in_egroup_p); DECLARE_RWSEM(uts_sem); -EXPORT_SYMBOL(uts_sem); - asmlinkage long sys_newuname(struct new_utsname __user * name) { int errno = 0; -- cgit v1.1 From 49b5cf34727a6c1be1568ab28e89a2d9a6bf51e0 Mon Sep 17 00:00:00 2001 From: Jonathan Lim Date: Fri, 25 Jul 2008 01:48:40 -0700 Subject: accounting: account for user time when updating memory integrals Adapt acct_update_integrals() to include user time when calculating the time difference. The units of acct_rss_mem1 and acct_vm_mem1 are also changed from pages-jiffies to pages-usecs to avoid calling jiffies_to_usecs() in xacct_add_tsk() which might overflow. 
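A rough illustration of the overflow, assuming HZ=1000, 4 KiB pages and jiffies_to_usecs() returning a 32-bit unsigned int:

	256 MB RSS                    =       65536 pages
	65536 pages * 66 jiffies      =     4325376 pages-jiffies
	jiffies_to_usecs(4325376)     =  4325376000 pages-usecs
	                               > UINT_MAX (4294967295)

So roughly 66 ms of accounting at a quarter-gigabyte RSS is already enough to wrap, and acct_rss_mem1/acct_vm_mem1 accumulate over the whole task lifetime. Keeping the integrals in pages-usecs in a u64 avoids the conversion entirely.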
Signed-off-by: Jonathan Lim Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 ++ kernel/tsacct.c | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6acf749..0047bd9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4046,6 +4046,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + /* Account for user time used */ + acct_update_integrals(p); } /* diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 4ab1b58..1da6990 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-jiffies to Mbyte-usec */ - stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; - stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; + /* convert pages-usec to Mbyte-usec */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { - long delta = cputime_to_jiffies( - cputime_sub(tsk->stime, tsk->acct_stimexpd)); + cputime_t time, dtime; + struct timeval value; + u64 delta; + + time = tsk->stime + tsk->utime; + dtime = cputime_sub(time, tsk->acct_timexpd); + jiffies_to_timeval(cputime_to_jiffies(dtime), &value); + delta = value.tv_sec; + delta = delta * USEC_PER_SEC + value.tv_usec; if (delta == 0) return; - tsk->acct_stimexpd = tsk->stime; + tsk->acct_timexpd = time; tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } @@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk) */ void acct_clear_integrals(struct task_struct *tsk) { - tsk->acct_stimexpd = 0; + tsk->acct_timexpd = 0; tsk->acct_rss_mem1 = 0; tsk->acct_vm_mem1 = 0; } -- cgit v1.1 From 081e4c8a75692c21f3a119a81ca3270081879d0e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:42 -0700 Subject: bsdacct: rename acct_gbls to bsd_acct_struct After I fixed access to task->tgid in kernel/acct.c, Oleg pointed out some bad side effects with this accounting vs pid namespaces interaction. I.e. when some task in pid namespace sets this accounting up, this blocks all the others from doing the same. Restricting this to init namespace only could help, but didn't look a graceful solution. So here is the approach to make this accounting work with pid namespaces properly. The idea is simple - when a task dies it accounts itself in each namespace it is visible from and which set the accounting up. 
For example here are the commands run and the output of lastcomm from init and sub namespaces: init_ns# accton pacct sub_ns# accton pacct (this is a different file - sub ns is run in a chroot-ed environment) init_ns# cat /dev/null sub_ns# ls /dev/null init_ns# accton sub_ns# accton sub_ns# lastcomm -f pacct ls 0 [136,0] 0.00 secs Thu May 15 10:30 accton 0 [136,0] 0.00 secs Thu May 15 10:30 init_ns# lastcomm -f pacct accton root pts/0 0.00 secs Thu May 15 14:30 << got from sub cat root pts/1 0.00 secs Thu May 15 14:30 ls root pts/0 0.00 secs Thu May 15 14:30 << got from sub accton root pts/1 0.00 secs Thu May 15 14:30 That was the summary, the details are in patches. This patch: It will be visible in pid_namespace.h file, so fix its name to look better outside the acct.c file. Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 91e1cfd..ee3e605 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -82,7 +82,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *); * can be placed in the same cache line as the lock. This primes * the cache line to have the data after getting the lock. */ -struct acct_glbs { +struct bsd_acct_struct { spinlock_t lock; volatile int active; volatile int needcheck; @@ -91,7 +91,7 @@ struct acct_glbs { struct timer_list timer; }; -static struct acct_glbs acct_globals __cacheline_aligned = +static struct bsd_acct_struct acct_globals __cacheline_aligned = {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; /* -- cgit v1.1 From 84406c153a5bfa5d8b428a0933e9d39db6b59a75 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:42 -0700 Subject: pidns: use kzalloc when allocating new pid_namespace struct It makes many fields initialization implicit helping in auto-setting #ifdef-ed fields (bsd-acct related pointer will be such). Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 98702b4..06331cc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -71,7 +71,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level) struct pid_namespace *ns; int i; - ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); + ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out; @@ -84,17 +84,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level) goto out_free_map; kref_init(&ns->kref); - ns->last_pid = 0; - ns->child_reaper = NULL; ns->level = level; set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - for (i = 1; i < PIDMAP_ENTRIES; i++) { - ns->pidmap[i].page = NULL; + for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - } return ns; -- cgit v1.1 From 1c552858ac2b1732a99d234d46b98098baef41ff Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:44 -0700 Subject: bsdacct: "truthify" a comment near acct_process The acct_process does not accept any arguments actually. Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index ee3e605..d9ee1838 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -579,7 +579,6 @@ void acct_collect(long exitcode, int group_dead) /** * acct_process - now just a wrapper around do_acct_process - * @exitcode: task exit code * * handles process accounting for an exiting task */ -- cgit v1.1 From e59a04a7aa5ce2483470aee4f2eb79ba6b9afe8b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:44 -0700 Subject: bsdacct: make check timer accept a bsd_acct_struct argument We're going to have many bsd_acct_struct instances, not just one, so the timer (currently working with a global one) has to know which one to work with. Use a handy setup_timer macro for it (thanks to Oleg for one). Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index d9ee1838..05f8bc0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -97,9 +97,10 @@ static struct bsd_acct_struct acct_globals __cacheline_aligned = /* * Called whenever the timer says to check the free space. */ -static void acct_timeout(unsigned long unused) +static void acct_timeout(unsigned long x) { - acct_globals.needcheck = 1; + struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x; + acct->needcheck = 1; } /* @@ -193,8 +194,8 @@ static void acct_file_reopen(struct file *file) acct_globals.needcheck = 0; acct_globals.active = 1; /* It's been deleted if it was used before so this is safe */ - init_timer(&acct_globals.timer); - acct_globals.timer.function = acct_timeout; + setup_timer(&acct_globals.timer, acct_timeout, + (unsigned long)&acct_globals); acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; add_timer(&acct_globals.timer); } -- cgit v1.1 From a75d97976517dcda69150fd81d6be86ae63324a1 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:45 -0700 Subject: bsdacct: turn the acct_lock from on-the-struct to global Don't use per-bsd-acct-struct lock, but work with a global one. This lock is taken for short periods, so it doesn't seem it'll become a bottleneck, but it will allow us to easily avoid many locking difficulties in the future. So this is a mostly s/acct_globals.lock/acct_lock/ over the file. Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 05f8bc0..fc71c13 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -83,7 +83,6 @@ static void do_acct_process(struct pid_namespace *ns, struct file *); * the cache line to have the data after getting the lock. */ struct bsd_acct_struct { - spinlock_t lock; volatile int active; volatile int needcheck; struct file *file; @@ -91,8 +90,9 @@ struct bsd_acct_struct { struct timer_list timer; }; -static struct bsd_acct_struct acct_globals __cacheline_aligned = - {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; +static DEFINE_SPINLOCK(acct_lock); + +static struct bsd_acct_struct acct_globals __cacheline_aligned; /* * Called whenever the timer says to check the free space. 
@@ -114,11 +114,11 @@ static int check_free_space(struct file *file) sector_t resume; sector_t suspend; - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); res = acct_globals.active; if (!file || !acct_globals.needcheck) goto out; - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); /* May block */ if (vfs_statfs(file->f_path.dentry, &sbuf)) @@ -140,7 +140,7 @@ static int check_free_space(struct file *file) * If some joker switched acct_globals.file under us we'ld better be * silent and _not_ touch anything. */ - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); if (file != acct_globals.file) { if (act) res = act>0; @@ -165,7 +165,7 @@ static int check_free_space(struct file *file) add_timer(&acct_globals.timer); res = acct_globals.active; out: - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); return res; } @@ -173,7 +173,7 @@ out: * Close the old accounting file (if currently open) and then replace * it with file (if non-NULL). * - * NOTE: acct_globals.lock MUST be held on entry and exit. + * NOTE: acct_lock MUST be held on entry and exit. */ static void acct_file_reopen(struct file *file) { @@ -201,11 +201,11 @@ static void acct_file_reopen(struct file *file) } if (old_acct) { mnt_unpin(old_acct->f_path.mnt); - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); do_acct_process(old_ns, old_acct); filp_close(old_acct, NULL); put_pid_ns(old_ns); - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); } } @@ -235,10 +235,10 @@ static int acct_on(char *name) return error; } - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); mnt_pin(file->f_path.mnt); acct_file_reopen(file); - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ @@ -272,9 +272,9 @@ asmlinkage long sys_acct(const char __user *name) } else { error = security_acct(NULL); if (!error) { - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); acct_file_reopen(NULL); - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); } } return error; @@ -289,10 +289,10 @@ asmlinkage long sys_acct(const char __user *name) */ void acct_auto_close_mnt(struct vfsmount *m) { - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); if (acct_globals.file && acct_globals.file->f_path.mnt == m) acct_file_reopen(NULL); - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); } /** @@ -304,12 +304,12 @@ void acct_auto_close_mnt(struct vfsmount *m) */ void acct_auto_close(struct super_block *sb) { - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); if (acct_globals.file && acct_globals.file->f_path.mnt->mnt_sb == sb) { acct_file_reopen(NULL); } - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); } /* @@ -594,15 +594,15 @@ void acct_process(void) if (!acct_globals.file) return; - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); file = acct_globals.file; if (unlikely(!file)) { - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); return; } get_file(file); ns = get_pid_ns(acct_globals.ns); - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); do_acct_process(ns, file); fput(file); -- cgit v1.1 From 6248b1b342005a428b1247b4e89249da1528d88d Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:46 -0700 Subject: bsdacct: make internal code work with passed bsd_acct_struct, not global This adds the appropriate pointer to all the internal (i.e. static) functions that work with global acct instance. 
API calls pass a global instance to them (while we still have such). Mostly this is a s/acct_globals./acct->/ over the file. Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 77 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 39 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index fc71c13..72d4760 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,7 +75,8 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct pid_namespace *ns, struct file *); +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *); /* * This structure is used so that all the data protected by lock @@ -106,7 +107,7 @@ static void acct_timeout(unsigned long x) /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct file *file) +static int check_free_space(struct bsd_acct_struct *acct, struct file *file) { struct kstatfs sbuf; int res; @@ -115,8 +116,8 @@ static int check_free_space(struct file *file) sector_t suspend; spin_lock(&acct_lock); - res = acct_globals.active; - if (!file || !acct_globals.needcheck) + res = acct->active; + if (!file || !acct->needcheck) goto out; spin_unlock(&acct_lock); @@ -137,33 +138,33 @@ static int check_free_space(struct file *file) act = 0; /* - * If some joker switched acct_globals.file under us we'ld better be + * If some joker switched acct->file under us we'ld better be * silent and _not_ touch anything. */ spin_lock(&acct_lock); - if (file != acct_globals.file) { + if (file != acct->file) { if (act) res = act>0; goto out; } - if (acct_globals.active) { + if (acct->active) { if (act < 0) { - acct_globals.active = 0; + acct->active = 0; printk(KERN_INFO "Process accounting paused\n"); } } else { if (act > 0) { - acct_globals.active = 1; + acct->active = 1; printk(KERN_INFO "Process accounting resumed\n"); } } - del_timer(&acct_globals.timer); - acct_globals.needcheck = 0; - acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct_globals.timer); - res = acct_globals.active; + del_timer(&acct->timer); + acct->needcheck = 0; + acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; + add_timer(&acct->timer); + res = acct->active; out: spin_unlock(&acct_lock); return res; @@ -175,34 +176,33 @@ out: * * NOTE: acct_lock MUST be held on entry and exit. 
*/ -static void acct_file_reopen(struct file *file) +static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file) { struct file *old_acct = NULL; struct pid_namespace *old_ns = NULL; - if (acct_globals.file) { - old_acct = acct_globals.file; - old_ns = acct_globals.ns; - del_timer(&acct_globals.timer); - acct_globals.active = 0; - acct_globals.needcheck = 0; - acct_globals.file = NULL; + if (acct->file) { + old_acct = acct->file; + old_ns = acct->ns; + del_timer(&acct->timer); + acct->active = 0; + acct->needcheck = 0; + acct->file = NULL; } if (file) { - acct_globals.file = file; - acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); - acct_globals.needcheck = 0; - acct_globals.active = 1; + acct->file = file; + acct->ns = get_pid_ns(task_active_pid_ns(current)); + acct->needcheck = 0; + acct->active = 1; /* It's been deleted if it was used before so this is safe */ - setup_timer(&acct_globals.timer, acct_timeout, - (unsigned long)&acct_globals); - acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct_globals.timer); + setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); + acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; + add_timer(&acct->timer); } if (old_acct) { mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_lock); - do_acct_process(old_ns, old_acct); + do_acct_process(acct, old_ns, old_acct); filp_close(old_acct, NULL); put_pid_ns(old_ns); spin_lock(&acct_lock); @@ -237,7 +237,7 @@ static int acct_on(char *name) spin_lock(&acct_lock); mnt_pin(file->f_path.mnt); - acct_file_reopen(file); + acct_file_reopen(&acct_globals, file); spin_unlock(&acct_lock); mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ @@ -273,7 +273,7 @@ asmlinkage long sys_acct(const char __user *name) error = security_acct(NULL); if (!error) { spin_lock(&acct_lock); - acct_file_reopen(NULL); + acct_file_reopen(&acct_globals, NULL); spin_unlock(&acct_lock); } } @@ -291,7 +291,7 @@ void acct_auto_close_mnt(struct vfsmount *m) { spin_lock(&acct_lock); if (acct_globals.file && acct_globals.file->f_path.mnt == m) - acct_file_reopen(NULL); + acct_file_reopen(&acct_globals, NULL); spin_unlock(&acct_lock); } @@ -307,7 +307,7 @@ void acct_auto_close(struct super_block *sb) spin_lock(&acct_lock); if (acct_globals.file && acct_globals.file->f_path.mnt->mnt_sb == sb) { - acct_file_reopen(NULL); + acct_file_reopen(&acct_globals, NULL); } spin_unlock(&acct_lock); } @@ -426,7 +426,8 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(struct pid_namespace *ns, struct file *file) +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *file) { struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; @@ -441,7 +442,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file) * First check to see if there is enough free_space to continue * the process accounting system. 
*/ - if (!check_free_space(file)) + if (!check_free_space(acct, file)) return; /* @@ -604,7 +605,7 @@ void acct_process(void) ns = get_pid_ns(acct_globals.ns); spin_unlock(&acct_lock); - do_acct_process(ns, file); + do_acct_process(&acct_globals, ns, file); fput(file); put_pid_ns(ns); } -- cgit v1.1 From 0b6b030fc30d169bb406b34b4fc60d99dde4a9c6 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:47 -0700 Subject: bsdacct: switch from global bsd_acct_struct instance to per-pidns one Allocate the structure on the first call to sys_acct(). After this each namespace, that ordered the accounting, will live with this structure till its own death. Two notes - routines, that close the accounting on fs umount time use the init_pid_ns's acct by now; - accounting routine accounts to dying task's namespace (also by now). Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 84 +++++++++++++++++++++++++++++++++++++++----------- kernel/pid_namespace.c | 2 ++ 2 files changed, 68 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 72d4760..febbbc6 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -93,8 +93,6 @@ struct bsd_acct_struct { static DEFINE_SPINLOCK(acct_lock); -static struct bsd_acct_struct acct_globals __cacheline_aligned; - /* * Called whenever the timer says to check the free space. */ @@ -176,7 +174,8 @@ out: * * NOTE: acct_lock MUST be held on entry and exit. */ -static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file) +static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, + struct pid_namespace *ns) { struct file *old_acct = NULL; struct pid_namespace *old_ns = NULL; @@ -188,10 +187,11 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file) acct->active = 0; acct->needcheck = 0; acct->file = NULL; + acct->ns = NULL; } if (file) { acct->file = file; - acct->ns = get_pid_ns(task_active_pid_ns(current)); + acct->ns = ns; acct->needcheck = 0; acct->active = 1; /* It's been deleted if it was used before so this is safe */ @@ -204,7 +204,6 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file) spin_unlock(&acct_lock); do_acct_process(acct, old_ns, old_acct); filp_close(old_acct, NULL); - put_pid_ns(old_ns); spin_lock(&acct_lock); } } @@ -213,6 +212,8 @@ static int acct_on(char *name) { struct file *file; int error; + struct pid_namespace *ns; + struct bsd_acct_struct *acct = NULL; /* Difference from BSD - they don't do O_APPEND */ file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); @@ -229,18 +230,34 @@ static int acct_on(char *name) return -EIO; } + ns = task_active_pid_ns(current); + if (ns->bacct == NULL) { + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (acct == NULL) { + filp_close(file, NULL); + return -ENOMEM; + } + } + error = security_acct(file); if (error) { + kfree(acct); filp_close(file, NULL); return error; } spin_lock(&acct_lock); + if (ns->bacct == NULL) { + ns->bacct = acct; + acct = NULL; + } + mnt_pin(file->f_path.mnt); - acct_file_reopen(&acct_globals, file); + acct_file_reopen(ns->bacct, file, ns); spin_unlock(&acct_lock); mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ + kfree(acct); return 0; } @@ -270,10 +287,16 @@ asmlinkage long sys_acct(const char __user *name) error = acct_on(tmp); putname(tmp); } else { + struct bsd_acct_struct *acct; + + acct = 
task_active_pid_ns(current)->bacct; + if (acct == NULL) + return 0; + error = security_acct(NULL); if (!error) { spin_lock(&acct_lock); - acct_file_reopen(&acct_globals, NULL); + acct_file_reopen(acct, NULL, NULL); spin_unlock(&acct_lock); } } @@ -289,9 +312,15 @@ asmlinkage long sys_acct(const char __user *name) */ void acct_auto_close_mnt(struct vfsmount *m) { + struct bsd_acct_struct *acct; + + acct = init_pid_ns.bacct; + if (acct == NULL) + return; + spin_lock(&acct_lock); - if (acct_globals.file && acct_globals.file->f_path.mnt == m) - acct_file_reopen(&acct_globals, NULL); + if (acct->file && acct->file->f_path.mnt == m) + acct_file_reopen(acct, NULL, NULL); spin_unlock(&acct_lock); } @@ -304,10 +333,29 @@ void acct_auto_close_mnt(struct vfsmount *m) */ void acct_auto_close(struct super_block *sb) { + struct bsd_acct_struct *acct; + + acct = init_pid_ns.bacct; + if (acct == NULL) + return; + spin_lock(&acct_lock); - if (acct_globals.file && - acct_globals.file->f_path.mnt->mnt_sb == sb) { - acct_file_reopen(&acct_globals, NULL); + if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) + acct_file_reopen(acct, NULL, NULL); + spin_unlock(&acct_lock); +} + +void acct_exit_ns(struct pid_namespace *ns) +{ + struct bsd_acct_struct *acct; + + spin_lock(&acct_lock); + acct = ns->bacct; + if (acct != NULL) { + if (acct->file != NULL) + acct_file_reopen(acct, NULL, NULL); + + kfree(acct); } spin_unlock(&acct_lock); } @@ -587,25 +635,25 @@ void acct_collect(long exitcode, int group_dead) void acct_process(void) { struct file *file = NULL; - struct pid_namespace *ns; + struct pid_namespace *ns = task_active_pid_ns(current); + struct bsd_acct_struct *acct; + acct = ns->bacct; /* * accelerate the common fastpath: */ - if (!acct_globals.file) + if (!acct || !acct->file) return; spin_lock(&acct_lock); - file = acct_globals.file; + file = acct->file; if (unlikely(!file)) { spin_unlock(&acct_lock); return; } get_file(file); - ns = get_pid_ns(acct_globals.ns); spin_unlock(&acct_lock); - do_acct_process(&acct_globals, ns, file); + do_acct_process(acct, ns, file); fput(file); - put_pid_ns(ns); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 06331cc..ea567b7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -181,6 +182,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* Child reaper for the pid namespace is going away */ pid_ns->child_reaper = NULL; + acct_exit_ns(pid_ns); return; } -- cgit v1.1 From b5a7174875ea570cc675f2c503e800db8efdd6a7 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:47 -0700 Subject: bsdacct: turn acct off for all pidns-s on umount time All the bsd_acct_strcts with opened accounting are linked into a global list. So, the acct_auto_close(_mnt) walks one and drops the accounting for each. Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index febbbc6..7fc9f9d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -89,9 +89,11 @@ struct bsd_acct_struct { struct file *file; struct pid_namespace *ns; struct timer_list timer; + struct list_head list; }; static DEFINE_SPINLOCK(acct_lock); +static LIST_HEAD(acct_list); /* * Called whenever the timer says to check the free space. 
@@ -188,12 +190,14 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, acct->needcheck = 0; acct->file = NULL; acct->ns = NULL; + list_del(&acct->list); } if (file) { acct->file = file; acct->ns = ns; acct->needcheck = 0; acct->active = 1; + list_add(&acct->list, &acct_list); /* It's been deleted if it was used before so this is safe */ setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; @@ -314,13 +318,13 @@ void acct_auto_close_mnt(struct vfsmount *m) { struct bsd_acct_struct *acct; - acct = init_pid_ns.bacct; - if (acct == NULL) - return; - spin_lock(&acct_lock); - if (acct->file && acct->file->f_path.mnt == m) - acct_file_reopen(acct, NULL, NULL); +restart: + list_for_each_entry(acct, &acct_list, list) + if (acct->file && acct->file->f_path.mnt == m) { + acct_file_reopen(acct, NULL, NULL); + goto restart; + } spin_unlock(&acct_lock); } @@ -335,13 +339,13 @@ void acct_auto_close(struct super_block *sb) { struct bsd_acct_struct *acct; - acct = init_pid_ns.bacct; - if (acct == NULL) - return; - spin_lock(&acct_lock); - if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) - acct_file_reopen(acct, NULL, NULL); +restart: + list_for_each_entry(acct, &acct_list, list) + if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { + acct_file_reopen(acct, NULL, NULL); + goto restart; + } spin_unlock(&acct_lock); } -- cgit v1.1 From 7d1e13505be8c2bd2207894f4e0f069e1f9b51c9 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:48 -0700 Subject: bsdacct: account dying tasks in all relevant namespaces This just makes the acct_proces walk the pid namespaces from current up to the top and account a task in each with the accounting turned on. ns->parent access if safe lockless, since current it still alive and holds its namespace, which in turn holds its parent. Signed-off-by: Pavel Emelyanov Cc: Balbir Singh Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 7fc9f9d..0feba97 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -631,15 +631,9 @@ void acct_collect(long exitcode, int group_dead) spin_unlock_irq(¤t->sighand->siglock); } -/** - * acct_process - now just a wrapper around do_acct_process - * - * handles process accounting for an exiting task - */ -void acct_process(void) +static void acct_process_in_ns(struct pid_namespace *ns) { struct file *file = NULL; - struct pid_namespace *ns = task_active_pid_ns(current); struct bsd_acct_struct *acct; acct = ns->bacct; @@ -661,3 +655,16 @@ void acct_process(void) do_acct_process(acct, ns, file); fput(file); } + +/** + * acct_process - now just a wrapper around do_acct_process + * + * handles process accounting for an exiting task + */ +void acct_process(void) +{ + struct pid_namespace *ns; + + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) + acct_process_in_ns(ns); +} -- cgit v1.1 From 0c18d7a5df82524e634637c3aec24d4cba096442 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jul 2008 01:48:49 -0700 Subject: bsdacct: fix and add comments around acct_process() Fix the one describing what this function is and add one more - about locking absence around pid namespaces loop. 
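For context on why the loop can stay lockless: besides current pinning its own namespace until the very end of exit, every deeper pid namespace holds a reference on its parent from the moment it is created, roughly (paraphrased from the namespace setup path, not part of this patch):

	new_ns = create_pid_namespace(old_ns->level + 1);
	if (!IS_ERR(new_ns))
		new_ns->parent = get_pid_ns(old_ns);

so the ns->parent chain walked by acct_process() cannot go away under the exiting task.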
Signed-off-by: Pavel Emelyanov Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 0feba97..dd68b90 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -657,7 +657,8 @@ static void acct_process_in_ns(struct pid_namespace *ns) } /** - * acct_process - now just a wrapper around do_acct_process + * acct_process - now just a wrapper around acct_process_in_ns, + * which in turn is a wrapper around do_acct_process. * * handles process accounting for an exiting task */ @@ -665,6 +666,11 @@ void acct_process(void) { struct pid_namespace *ns; + /* + * This loop is safe lockless, since current is still + * alive and holds its namespace, which in turn holds + * its parent. + */ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) acct_process_in_ns(ns); } -- cgit v1.1 From 297c5d92634c809cef23d73e7b2556f2528ff7e2 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 25 Jul 2008 01:48:49 -0700 Subject: task IO accounting: provide distinct tgid/tid I/O statistics Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate parent I/O statistics in /proc/pid/io. This approach follows the same model used to account per-process and per-thread CPU times. As a practial application, this allows for example to quickly find the top I/O consumer when a process spawns many child threads that perform the actual I/O work, because the aggregated I/O statistics can always be found in /proc/pid/io. [ Oleg Nesterov points out that we should check that the task is still alive before we iterate over the threads, but also says that we can do that fixup on top of this later. - Linus ] Acked-by: Balbir Singh Signed-off-by: Andrea Righi Cc: Matt Heaton Cc: Shailabh Nagar Acked-by-with-comments: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 27 +++++++++++++++++++++++++++ kernel/fork.c | 6 ++++++ 2 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 8a4d4d1..ad933bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -120,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); +#ifdef CONFIG_TASK_XACCT + sig->rchar += tsk->rchar; + sig->wchar += tsk->wchar; + sig->syscr += tsk->syscr; + sig->syscw += tsk->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + sig->ioac.read_bytes += tsk->ioac.read_bytes; + sig->ioac.write_bytes += tsk->ioac.write_bytes; + sig->ioac.cancelled_write_bytes += + tsk->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. 
*/ } @@ -1366,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; +#ifdef CONFIG_TASK_XACCT + psig->rchar += p->rchar + sig->rchar; + psig->wchar += p->wchar + sig->wchar; + psig->syscr += p->syscr + sig->syscr; + psig->syscw += p->syscw + sig->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + psig->ioac.read_bytes += + p->ioac.read_bytes + sig->ioac.read_bytes; + psig->ioac.write_bytes += + p->ioac.write_bytes + sig->ioac.write_bytes; + psig->ioac.cancelled_write_bytes += + p->ioac.cancelled_write_bytes + + sig->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 813d5c8..b99d73e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -812,6 +812,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; +#ifdef CONFIG_TASK_XACCT + sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + memset(&sig->ioac, 0, sizeof(sig->ioac)); +#endif sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); -- cgit v1.1 From 3e85ba034deec351f02cb55ff225bbd616463841 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Jul 2008 01:48:50 -0700 Subject: tsacct: fix bacct_add_tsk()'s use of do_div() Fix bacct_add_tsk()'s use of do_div() on an s64 by making ac_etime a u64 instead and dividing that. Possibly this should be guarded lest the interval calculation turn up negative, but the possible negativity of the result of the division is cast away, and it shouldn't end up negative anyway. This was introduced by patch f3cef7a99469afc159fec3a61b42dc7ca5b6824f. Signed-off-by: David Howells Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 1da6990..3da47cc 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -28,14 +28,14 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) { struct timespec uptime, ts; - s64 ac_etime; + u64 ac_etime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in timespec */ do_posix_clock_monotonic_gettime(&uptime); ts = timespec_sub(uptime, tsk->start_time); - /* rebase elapsed time to usec */ + /* rebase elapsed time to usec (should never be negative) */ ac_etime = timespec_to_ns(&ts); do_div(ac_etime, NSEC_PER_USEC); stats->ac_etime = ac_etime; -- cgit v1.1 From 873b47717732c2f33a4b14de02571a4295a02f0c Mon Sep 17 00:00:00 2001 From: Keika Kobayashi Date: Fri, 25 Jul 2008 01:48:52 -0700 Subject: per-task-delay-accounting: add memory reclaim delay Sometimes, application responses become bad under heavy memory load. Applications take a bit time to reclaim memory. The statistics, how long memory reclaim takes, will be useful to measure memory usage. This patch adds accounting memory reclaim to per-task-delay-accounting for accounting the time of do_try_to_free_pages(). - When System is under low memory load, memory reclaim may not occur. 
$ free total used free shared buffers cached Mem: 8197800 1577300 6620500 0 4808 1516724 -/+ buffers/cache: 55768 8142032 Swap: 16386292 0 16386292 $ vmstat 1 procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- r b swpd free buff cache si so bi bo in cs us sy id wa 0 0 0 5069748 10612 3014060 0 0 0 0 3 26 0 0 100 0 0 0 0 5069748 10612 3014060 0 0 0 0 4 22 0 0 100 0 0 0 0 5069748 10612 3014060 0 0 0 0 3 18 0 0 100 0 Measure the time of tar command. $ ls -s test.dat 1501472 test.dat $ time tar cvf test.tar test.dat real 0m13.388s user 0m0.116s sys 0m5.304s $ ./delayget -d -p CPU count real total virtual total delay total 428 5528345500 5477116080 62749891 IO count delay total 338 8078977189 SWAP count delay total 0 0 RECLAIM count delay total 0 0 - When system is under heavy memory load memory reclaim may occur. $ vmstat 1 procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- r b swpd free buff cache si so bi bo in cs us sy id wa 0 0 7159032 49724 1812 3012 0 0 0 0 3 24 0 0 100 0 0 0 7159032 49724 1812 3012 0 0 0 0 4 24 0 0 100 0 0 0 7159032 49848 1812 3012 0 0 0 0 3 22 0 0 100 0 In this case, one process uses more 8G memory by execution of malloc() and memset(). $ time tar cvf test.tar test.dat real 1m38.563s <- increased by 85 sec user 0m0.140s sys 0m7.060s $ ./delayget -d -p CPU count real total virtual total delay total 9021 7140446250 7315277975 923201824 IO count delay total 8965 90466349669 SWAP count delay total 3 21036367 RECLAIM count delay total 740 61011951153 In the later case, the value of RECLAIM is increasing. So, taskstats can show how much memory reclaim influences TAT. Signed-off-by: Keika Kobayashi Acked-by: Balbir Singh Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 10e43fd..84b6782 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -165,3 +165,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) return ret; } +void __delayacct_freepages_start(void) +{ + delayacct_start(¤t->delays->freepages_start); +} + +void __delayacct_freepages_end(void) +{ + delayacct_end(¤t->delays->freepages_start, + ¤t->delays->freepages_end, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); +} + -- cgit v1.1 From 016ae219b920c4e606088761d3d6070cdf8ba706 Mon Sep 17 00:00:00 2001 From: Keika Kobayashi Date: Fri, 25 Jul 2008 01:48:53 -0700 Subject: per-task-delay-accounting: update taskstats for memory reclaim delay Add members for memory reclaim delay to taskstats, and accumulate them in __delayacct_add_tsk() . Signed-off-by: Keika Kobayashi Cc: Hiroshi Shimamoto Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 84b6782..b3179da 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; + tmp = d->freepages_delay_total + tsk->delays->freepages_delay; + d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 
0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; + d->freepages_count += tsk->delays->freepages_count; spin_unlock_irqrestore(&tsk->delays->lock, flags); done: -- cgit v1.1 From b81f3ea92ba1fa676775677679889dc2a7f03c8b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 25 Jul 2008 01:48:55 -0700 Subject: taskstats: remove initialization of static per-cpu variable Cc: Shailabh Nagar Signed-off-by: Vegard Nossum Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 06b1754..bd6be76 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -35,7 +35,7 @@ */ #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) -static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; +static DEFINE_PER_CPU(__u32, taskstats_seqnum); static int family_registered; struct kmem_cache *taskstats_cache; -- cgit v1.1 From 9b81361631bbb1d85c99ddec677d42afe516737b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jul 2008 13:02:37 +0200 Subject: signalfd: fix undefined reference to `compat_sys_signalfd4' when !CONFIG_SIGNALFD fix: arch/x86/ia32/built-in.o: In function `ia32_sys_call_table': (.rodata+0xa38): undefined reference to `compat_sys_signalfd4' on !CONFIG_SIGNALFD. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 55eca15..08d6e1b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -160,6 +160,7 @@ cond_syscall(sys_ioprio_get); cond_syscall(sys_signalfd); cond_syscall(sys_signalfd4); cond_syscall(compat_sys_signalfd); +cond_syscall(compat_sys_signalfd4); cond_syscall(sys_timerfd_create); cond_syscall(sys_timerfd_settime); cond_syscall(sys_timerfd_gettime); -- cgit v1.1 From 1fe371044b21b226b96a9dd959e971b50b28c78e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 15:09:47 +0200 Subject: ftrace: fix modular build fix: ERROR: "start_critical_timings" [drivers/acpi/processor.ko] undefined! ERROR: "stop_critical_timings" [drivers/acpi/processor.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b1e4a89..ece6cfb 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -253,12 +253,14 @@ void start_critical_timings(void) if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING -- cgit v1.1 From b8d317d10cca76cabe6b03ebfeb23cc99118b731 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:29 -0700 Subject: cpumask: make cpumask_of_cpu_map generic If an arch doesn't define cpumask_of_cpu_map, create a generic statically-initialized one for them. This allows removal of the buggy cpumask_of_cpu() macro (&cpumask_of_cpu() gives address of out-of-scope var). An arch with NR_CPUS of 4096 probably wants to allocate this itself based on the actual number of CPUs, since otherwise they're using 2MB of rodata (1024 cpus means 128k). 
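As a rough stand-alone illustration of the idea (single-word masks and hypothetical names, not the kernel's cpumask type): entry n of a statically initialized table has only bit n set, and because the table has static storage duration, the address of an entry stays valid indefinitely - unlike the address of a function-local temporary, which is the out-of-scope bug the old cpumask_of_cpu() macro had.

    #include <stdio.h>

    #define MY_NR_CPUS 8

    /* Hypothetical single-word analogue of cpumask_of_cpu_map:
     * entry n has exactly bit n set, built at compile time. */
    #define BIT_ENTRY(n) (1UL << (n))

    static const unsigned long mask_of_cpu_map[MY_NR_CPUS] = {
            BIT_ENTRY(0), BIT_ENTRY(1), BIT_ENTRY(2), BIT_ENTRY(3),
            BIT_ENTRY(4), BIT_ENTRY(5), BIT_ENTRY(6), BIT_ENTRY(7),
    };

    /* Taking the address of a table entry is always safe: the storage
     * lives for the whole program, so the pointer never dangles. */
    static const unsigned long *mask_of_cpu(int cpu)
    {
            return &mask_of_cpu_map[cpu];
    }

    int main(void)
    {
            const unsigned long *m = mask_of_cpu(5);

            printf("cpu 5 mask = %#lx\n", *m);   /* prints 0x20 */
            return 0;
    }

Scaling such a table to thousands of CPUs is what drives the rodata-size concern above.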
That's what CONFIG_HAVE_CPUMASK_OF_CPU_MAP is for (only x86/64 does so at the moment). In future as we support more CPUs, we'll need to resort to a get_cpu_map()/put_cpu_map() allocation scheme. Signed-off-by: Mike Travis Signed-off-by: Rusty Russell Cc: Andrew Morton Cc: Jack Steiner Signed-off-by: Ingo Molnar --- kernel/cpu.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1..fe31ff3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -461,3 +461,112 @@ out: #endif /* CONFIG_PM_SLEEP_SMP */ #endif /* CONFIG_SMP */ + +#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +/* 64 bits of zeros, for initializers. */ +#if BITS_PER_LONG == 32 +#define Z64 0, 0 +#else +#define Z64 0 +#endif + +/* Initializer macros. */ +#define CMI0(n) { .bits = { 1UL << (n) } } +#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } } + +#define CMI8(n, ...) \ + CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__), \ + CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__), \ + CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__), \ + CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__) + +#if BITS_PER_LONG == 32 +#define CMI64(n, ...) \ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__), \ + CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__) +#else +#define CMI64(n, ...) \ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__), \ + CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__) +#endif + +#define CMI256(n, ...) \ + CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__), \ + CMI64((n)+128, Z64, Z64, __VA_ARGS__), \ + CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__) +#define Z256 Z64, Z64, Z64, Z64 + +#define CMI1024(n, ...) \ + CMI256((n), __VA_ARGS__), \ + CMI256((n)+256, Z256, __VA_ARGS__), \ + CMI256((n)+512, Z256, Z256, __VA_ARGS__), \ + CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__) +#define Z1024 Z256, Z256, Z256, Z256 + +/* We want this statically initialized, just to be safe. We try not + * to waste too much space, either. 
*/ +static const cpumask_t cpumask_map[] = { + CMI0(0), CMI0(1), CMI0(2), CMI0(3), +#if NR_CPUS > 4 + CMI0(4), CMI0(5), CMI0(6), CMI0(7), +#endif +#if NR_CPUS > 8 + CMI0(8), CMI0(9), CMI0(10), CMI0(11), + CMI0(12), CMI0(13), CMI0(14), CMI0(15), +#endif +#if NR_CPUS > 16 + CMI0(16), CMI0(17), CMI0(18), CMI0(19), + CMI0(20), CMI0(21), CMI0(22), CMI0(23), + CMI0(24), CMI0(25), CMI0(26), CMI0(27), + CMI0(28), CMI0(29), CMI0(30), CMI0(31), +#endif +#if NR_CPUS > 32 +#if BITS_PER_LONG == 32 + CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), + CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), + CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), + CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), + CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), + CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), + CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), + CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), +#else + CMI0(32), CMI0(33), CMI0(34), CMI0(35), + CMI0(36), CMI0(37), CMI0(38), CMI0(39), + CMI0(40), CMI0(41), CMI0(42), CMI0(43), + CMI0(44), CMI0(45), CMI0(46), CMI0(47), + CMI0(48), CMI0(49), CMI0(50), CMI0(51), + CMI0(52), CMI0(53), CMI0(54), CMI0(55), + CMI0(56), CMI0(57), CMI0(58), CMI0(59), + CMI0(60), CMI0(61), CMI0(62), CMI0(63), +#endif /* BITS_PER_LONG == 64 */ +#endif +#if NR_CPUS > 64 + CMI64(64, Z64), +#endif +#if NR_CPUS > 128 + CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), +#endif +#if NR_CPUS > 256 + CMI256(256, Z256), +#endif +#if NR_CPUS > 512 + CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), +#endif +#if NR_CPUS > 1024 + CMI1024(1024, Z1024), +#endif +#if NR_CPUS > 2048 + CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), +#endif +#if NR_CPUS > 4096 +#error NR_CPUS too big. Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +#endif +}; + +const cpumask_t *cpumask_of_cpu_map = cpumask_map; +#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.1 From 6524d938b3360504b43a1278b5a8403e85383d1a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:30 -0700 Subject: cpumask: put cpumask_of_cpu_map in the initdata section * Create the cpumask_of_cpu_map statically in the init data section using NR_CPUS but replace it during boot up with one sized by nr_cpu_ids (num possible cpus). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/cpu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index fe31ff3..9d4e1c2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,7 +462,6 @@ out: #endif /* CONFIG_SMP */ -#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP /* 64 bits of zeros, for initializers. */ #if BITS_PER_LONG == 32 #define Z64 0, 0 @@ -509,7 +508,11 @@ out: /* We want this statically initialized, just to be safe. We try not * to waste too much space, either. 
*/ -static const cpumask_t cpumask_map[] = { +static const cpumask_t cpumask_map[] +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +__initdata +#endif += { CMI0(0), CMI0(1), CMI0(2), CMI0(3), #if NR_CPUS > 4 CMI0(4), CMI0(5), CMI0(6), CMI0(7), @@ -569,4 +572,3 @@ static const cpumask_t cpumask_map[] = { }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; -#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.1 From 0bc3cc03fa6e1c20aecb5a33356bcaae410640b9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:31 -0700 Subject: cpumask: change cpumask_of_cpu_ptr to use new cpumask_of_cpu * Replace previous instances of the cpumask_of_cpu_ptr* macros with a the new (lvalue capable) generic cpumask_of_cpu(). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 3 +-- kernel/time/tick-common.c | 8 +++----- kernel/trace/trace_sysprof.c | 4 +--- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 738b411..ba9b205 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -33,9 +33,8 @@ static int stopmachine(void *cpu) { int irqs_disabled = 0; int prepared = 0; - cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - set_cpus_allowed_ptr(current, cpumask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bf43284..80c4336 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -196,12 +196,10 @@ static int tick_check_new_device(struct clock_event_device *newdev) struct tick_device *td; int cpu, ret = NOTIFY_OK; unsigned long flags; - cpumask_of_cpu_ptr_declare(cpumask); spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); - cpumask_of_cpu_ptr_next(cpumask, cpu); if (!cpu_isset(cpu, newdev->cpumask)) goto out_bc; @@ -209,7 +207,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? 
*/ - if (!cpus_equal(newdev->cpumask, *cpumask)) { + if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +220,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, *cpumask)) + if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) goto out_bc; } @@ -254,7 +252,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = NULL; } clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, cpumask); + tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index ce2d723..bb948e5 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -213,9 +213,7 @@ static void start_stack_timers(void) int cpu; for_each_online_cpu(cpu) { - cpumask_of_cpu_ptr(new_mask, cpu); - - set_cpus_allowed_ptr(current, new_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); start_stack_timer(cpu); } set_cpus_allowed_ptr(current, &saved_mask); -- cgit v1.1 From 5a7a201c51c324876d00a54e7208af6af12d1ca4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 16:50:47 +0200 Subject: cpumask: export cpumask_of_cpu_map fix: ERROR: "cpumask_of_cpu_map" [drivers/acpi/processor.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/microcode.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/speedstep-ich.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9d4e1c2..a35d899 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -572,3 +572,5 @@ __initdata }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; + +EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); -- cgit v1.1 From a2e2e3577c3ef2b5dbb866e97e612aae4adfa32f Mon Sep 17 00:00:00 2001 From: David Brownell Date: Fri, 25 Jul 2008 19:44:38 -0700 Subject: pm selftest: rtc paranoia Cope with a quirk of some RTCs (notably ACPI ones) which aren't guaranteed to implement oneshot behavior when they woke the system from sleeep: forcibly disable the alarm, just in case. Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 95bff23..0b7476f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -635,6 +635,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (status < 0) printk(err_suspend, status); + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); } static int __init has_wakealarm(struct device *dev, void *name_ptr) -- cgit v1.1 From 7fccf0326536c1b245b98740d489abb9aab69a12 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 25 Jul 2008 19:45:02 -0700 Subject: kernel/kexec.c: make 'kimage_terminate' void Since kimage_terminate() always returns 0, make it void. Signed-off-by: WANG Cong Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 1c5fcac..6db42ff 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -589,14 +589,12 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->unuseable_pages); } -static int kimage_terminate(struct kimage *image) +static void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; - - return 0; } #define for_each_kimage_entry(image, ptr, entry) \ @@ -997,9 +995,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, if (result) goto out; } - result = kimage_terminate(image); - if (result) - goto out; + kimage_terminate(image); } /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); -- cgit v1.1 From 3ab83521378268044a448113c6aa9a9e245f4d2f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:07 -0700 Subject: kexec jump This patch provides an enhancement to kexec/kdump. It implements the following features: - Backup/restore memory used by the original kernel before/after kexec. - Save/restore CPU state before/after kexec. The features of this patch can be used as a general method to call program in physical mode (paging turning off). This can be used to call BIOS code under Linux. kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 Usage example of calling some physical mode code and return: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_KEXEC=y CONFIG_PM=y CONFIG_KEXEC_JUMP=y 2. Build patched kexec-tool or download the pre-built one. 3. Build some physical mode executable named such as "phy_mode" 4. Boot kernel compiled in step 1. 5. Load physical mode executable with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context --args-none phy_mode 6. Call physical mode executable with following shell command line: /sbin/kexec -e Implementation point: To support jumping without reserving memory. One shadow backup page (source page) is allocated for each page used by kexeced code image (destination page). When do kexec_load, the image of kexeced code is loaded into source pages, and before executing, the destination pages and the source pages are swapped, so the contents of destination pages are backupped. Before jumping to the kexeced code image and after jumping back to the original kernel, the destination pages and the source pages are swapped too. C ABI (calling convention) is used as communication protocol between kernel and called code. A flag named KEXEC_PRESERVE_CONTEXT for sys_kexec_load is added to indicate that the loaded kernel image is used for jumping back. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. 
Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 31 ++++++++----------------------- 2 files changed, 65 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 6db42ff..a0d9209 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -242,6 +244,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, goto out; } + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + printk(KERN_ERR "Could not allocate swap buffer\n"); + goto out; + } + result = 0; out: if (result == 0) @@ -986,6 +994,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, if (result) goto out; + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; result = machine_kexec_prepare(image); if (result) goto out; @@ -1411,3 +1421,50 @@ static int __init crash_save_vmcoreinfo_init(void) } module_init(crash_save_vmcoreinfo_init) + +/** + * kernel_kexec - reboot the system + * + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ +int kernel_kexec(void) +{ + int error = 0; + + if (xchg(&kexec_lock, 1)) + return -EBUSY; + if (!kexec_image) { + error = -EINVAL; + goto Unlock; + } + + if (kexec_image->preserve_context) { +#ifdef CONFIG_KEXEC_JUMP + local_irq_disable(); + save_processor_state(); +#endif + } else { + blocking_notifier_call_chain(&reboot_notifier_list, + SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_shutdown(); + sysdev_shutdown(); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + } + + machine_kexec(kexec_image); + + if (kexec_image->preserve_context) { +#ifdef CONFIG_KEXEC_JUMP + restore_processor_state(); + local_irq_enable(); +#endif + } + + Unlock: + xchg(&kexec_lock, 0); + + return error; +} diff --git a/kernel/sys.c b/kernel/sys.c index 0c9d3fa..c0185809 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -301,26 +301,6 @@ void kernel_restart(char *cmd) } EXPORT_SYMBOL_GPL(kernel_restart); -/** - * kernel_kexec - reboot the system - * - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -static void kernel_kexec(void) -{ -#ifdef CONFIG_KEXEC - struct kimage *image; - image = xchg(&kexec_image, NULL); - if (!image) - return; - kernel_restart_prepare(NULL); - printk(KERN_EMERG "Starting new kernel\n"); - machine_shutdown(); - machine_kexec(image); -#endif -} - static void kernel_shutdown_prepare(enum system_states state) { blocking_notifier_call_chain(&reboot_notifier_list, @@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user kernel_restart(buffer); break; +#ifdef CONFIG_KEXEC case LINUX_REBOOT_CMD_KEXEC: - kernel_kexec(); - unlock_kernel(); - return -EINVAL; + { + int ret; + ret = kernel_kexec(); + unlock_kernel(); + return ret; + } +#endif #ifdef CONFIG_HIBERNATION case LINUX_REBOOT_CMD_SW_SUSPEND: -- cgit v1.1 From 89081d17f7bb81d89fa1aa9b70f821c5cf4d39e9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:10 -0700 Subject: kexec jump: save/restore device state This patch implements devices state save/restore before after kexec. 
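The ordering this requires follows the kernel's usual goto-unwind idiom: perform each quiescing step in turn, and on failure - or on the way back from the jump - undo the completed steps in reverse. Below is a minimal, self-contained sketch of that idiom only; the step names are hypothetical stand-ins for the real freeze/suspend/power-down calls, not the patch's code:

    #include <stdio.h>

    /* Hypothetical steps standing in for freeze_processes(), device_suspend(),
     * device_power_down(), ... in the real sequence. */
    static int step(const char *name, int fail)
    {
            printf("%s\n", name);
            return fail ? -1 : 0;
    }

    static void undo(const char *name)
    {
            printf("undo %s\n", name);
    }

    static int quiesce_and_run(void)
    {
            int error;

            error = step("freeze tasks", 0);
            if (error)
                    goto out;
            error = step("suspend devices", 0);
            if (error)
                    goto thaw_tasks;
            error = step("power down devices", 0);
            if (error)
                    goto resume_devices;

            printf("jump to the other kernel and back\n");

            /* Unwind in strict reverse order of setup. */
            undo("power down devices");
    resume_devices:
            undo("suspend devices");
    thaw_tasks:
            undo("freeze tasks");
    out:
            return error;
    }

    int main(void)
    {
            return quiesce_and_run() ? 1 : 0;
    }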
This patch together with features in kexec_jump patch can be used for following: - A simple hibernation implementation without ACPI support. You can kexec a hibernating kernel, save the memory image of original system and shutdown the system. When resuming, you restore the memory image of original system via ordinary kexec load then jump back. - Kernel/system debug through making system snapshot. You can make system snapshot, jump back, do some thing and make another system snapshot. - Cooperative multi-kernel/system. With kexec jump, you can switch between several kernels/systems quickly without boot process except the first time. This appears like swap a whole kernel/system out/in. - A general method to call program in physical mode (paging turning off). This can be used to invoke BIOS code under Linux. The following user-space tools can be used with kexec jump: - kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 - makedumpfile with patches are used as memory image saving tool, it can exclude free pages from original kernel memory image file. The patches and the precompiled makedumpfile can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-src_cvs_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-patches_cvs_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile_cvs_kh10 - An initramfs image can be used as the root file system of kexeced kernel. An initramfs image built with "BuildRoot" can be downloaded from the following URL: initramfs image: http://khibernation.sourceforge.net/download/release_v10/initramfs/rootfs_cvs_kh10.gz All user space tools above are included in the initramfs image. Usage example of simple hibernation: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_RELOCATABLE=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_PM=y CONFIG_HIBERNATION=y CONFIG_KEXEC_JUMP=y 2. Build an initramfs image contains kexec-tool and makedumpfile, or download the pre-built initramfs image, called rootfs.gz in following text. 3. Prepare a partition to save memory image of original kernel, called hibernating partition in following text. 4. Boot kernel compiled in step 1 (kernel A). 5. In the kernel A, load kernel compiled in step 1 (kernel B) with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context /boot/bzImage --mem-min=0x100000 --mem-max=0xffffff --initrd=rootfs.gz 6. Boot the kernel B with following shell command line: /sbin/kexec -e 7. The kernel B will boot as normal kexec. In kernel B the memory image of kernel A can be saved into hibernating partition as follow: jump_back_entry=`cat /proc/cmdline | tr ' ' '\n' | grep kexec_jump_back_entry | cut -d '='` echo $jump_back_entry > kexec_jump_back_entry cp /proc/vmcore dump.elf Then you can shutdown the machine as normal. 8. Boot kernel compiled in step 1 (kernel C). Use the rootfs.gz as root file system. 9. 
In kernel C, load the memory image of kernel A as follow: /sbin/kexec -l --args-none --entry=`cat kexec_jump_back_entry` dump.elf 10. Jump back to the kernel A as follow: /sbin/kexec -e Then, kernel A is resumed. Implementation point: To support jumping between two kernels, before jumping to (executing) the new kernel and jumping back to the original kernel, the devices are put into quiescent state, and the state of devices and CPU is saved. After jumping back from kexeced kernel and jumping to the new kernel, the state of devices and CPU are restored accordingly. The devices/CPU state save/restore code of software suspend is called to implement corresponding function. Known issues: - Because the segment number supported by sys_kexec_load is limited, hibernation image with many segments may not be load. This is planned to be eliminated by adding a new flag to sys_kexec_load to make a image can be loaded with multiple sys_kexec_load invoking. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/power/power.h | 2 -- 2 files changed, 39 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index a0d9209..c8a4370 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -26,6 +26,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -1441,7 +1445,31 @@ int kernel_kexec(void) if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP + mutex_lock(&pm_mutex); + pm_prepare_console(); + error = freeze_processes(); + if (error) { + error = -EBUSY; + goto Restore_console; + } + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_console; + error = disable_nonboot_cpus(); + if (error) + goto Resume_devices; local_irq_disable(); + /* At this point, device_suspend() has been called, + * but *not* device_power_down(). We *must* + * device_power_down() now. Otherwise, drivers for + * some devices (e.g. interrupt controllers) become + * desynchronized with the actual state of the + * hardware at resume time, and evil weirdness ensues. 
+ */ + error = device_power_down(PMSG_FREEZE); + if (error) + goto Enable_irqs; save_processor_state(); #endif } else { @@ -1459,7 +1487,18 @@ int kernel_kexec(void) if (kexec_image->preserve_context) { #ifdef CONFIG_KEXEC_JUMP restore_processor_state(); + device_power_up(PMSG_RESTORE); + Enable_irqs: local_irq_enable(); + enable_nonboot_cpus(); + Resume_devices: + device_resume(PMSG_RESTORE); + Resume_console: + resume_console(); + thaw_processes(); + Restore_console: + pm_restore_console(); + mutex_unlock(&pm_mutex); #endif } diff --git a/kernel/power/power.h b/kernel/power/power.h index 700f44e..acc0c10 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void); extern int pfn_is_nosave(unsigned long); -extern struct mutex pm_mutex; - #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ -- cgit v1.1 From 7babe8db99d305340cf4828ce1f5a1481d5622ef Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Fri, 25 Jul 2008 19:45:11 -0700 Subject: Full conversion to early_initcall() interface, remove old interface A previous patch added the early_initcall(), to allow a cleaner hooking of pre-SMP initcalls. Now we remove the older interface, converting all existing users to the new one. [akpm@linux-foundation.org: cleanups] [akpm@linux-foundation.org: build fix] [kosaki.motohiro@jp.fujitsu.com: warning fix] [kosaki.motohiro@jp.fujitsu.com: warning fix] Signed-off-by: Eduard - Gabriel Munteanu Cc: Tom Zanussi Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 5 ++++- kernel/smp.c | 4 +++- kernel/softirq.c | 3 ++- kernel/softlockup.c | 25 ++++++++++++++++++++++--- 4 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0047bd9..fde1a10 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6389,7 +6389,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -void __init migration_init(void) +static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -6399,7 +6399,10 @@ void __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + + return err; } +early_initcall(migration_init); #endif #ifdef CONFIG_SMP diff --git a/kernel/smp.c b/kernel/smp.c index 462c785..96fc7c0 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -33,7 +33,7 @@ struct call_single_queue { spinlock_t lock; }; -void __cpuinit init_call_single_data(void) +static int __cpuinit init_call_single_data(void) { int i; @@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void) spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } + return 0; } +early_initcall(init_call_single_data); static void csd_flag_wait(struct call_single_data *data) { diff --git a/kernel/softirq.c b/kernel/softirq.c index f6b03d5..c506f26 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init int spawn_ksoftirqd(void) +static __init int spawn_ksoftirqd(void) { void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); @@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } +early_initcall(spawn_ksoftirqd); #ifdef CONFIG_SMP /* diff --git a/kernel/softlockup.c 
b/kernel/softlockup.c index 7bd8d1a..b75b492 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init void spawn_softlockup_task(void) +static int __initdata nosoftlockup; + +static int __init nosoftlockup_setup(char *str) +{ + nosoftlockup = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); + +static int __init spawn_softlockup_task(void) { void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + int err; - BUG_ON(err == NOTIFY_BAD); + if (nosoftlockup) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + if (err == NOTIFY_BAD) { + BUG(); + return 1; + } cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; } +early_initcall(spawn_softlockup_task); -- cgit v1.1 From 20d8b67c06fa5e74f44e80b0a0fd68c8327f7c6a Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Fri, 25 Jul 2008 19:45:12 -0700 Subject: relay: add buffer-only channels; useful for early logging Allows one to create and use a channel with no associated files. Files can be initialized later. This is useful in scenarios such as logging in early code, before VFS is up. Therefore, such channels can be created and used as soon as kmem_cache_init() completed. This is needed by kmemtrace to do tracing in early kernel code. [kosaki.motohiro@jp.fujitsu.com: build fix] Signed-off-by: Eduard - Gabriel Munteanu Cc: Tom Zanussi Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 141 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 7de644c..04006ef 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_reset); +static inline void relay_set_buf_dentry(struct rchan_buf *buf, + struct dentry *dentry) +{ + buf->dentry = dentry; + buf->dentry->d_inode->i_size = buf->early_bytes; +} + +static struct dentry *relay_create_buf_file(struct rchan *chan, + struct rchan_buf *buf, + unsigned int cpu) +{ + struct dentry *dentry; + char *tmpname; + + tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmpname) + return NULL; + snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); + + /* Create file in fs */ + dentry = chan->cb->create_buf_file(tmpname, chan->parent, + S_IRUSR, buf, + &chan->is_global); + + kfree(tmpname); + + return dentry; +} + /* * relay_open_buf - create a new relay channel buffer * @@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) { struct rchan_buf *buf = NULL; struct dentry *dentry; - char *tmpname; if (chan->is_global) return chan->buf[0]; - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmpname) - goto end; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); - buf = relay_create_buf(chan); if (!buf) - goto free_name; + return NULL; + + if (chan->has_base_filename) { + dentry = relay_create_buf_file(chan, buf, cpu); + if (!dentry) + goto free_buf; + relay_set_buf_dentry(buf, dentry); + } buf->cpu = cpu; __relay_reset(buf, 1); - /* Create file in fs */ - dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, - buf, &chan->is_global); - if (!dentry) - goto free_buf; - - 
buf->dentry = dentry; - if(chan->is_global) { chan->buf[0] = buf; buf->cpu = 0; } - goto free_name; + return buf; free_buf: relay_destroy_buf(buf); - buf = NULL; -free_name: - kfree(tmpname); -end: - return buf; + return NULL; } /** @@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, /** * relay_open - create a new relay channel - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory + * @base_filename: base name of files to create, %NULL for buffering only + * @parent: dentry of parent directory, %NULL for root directory or buffer * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers * @cb: client callback functions @@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename, { unsigned int i; struct rchan *chan; - if (!base_filename) - return NULL; if (!(subbuf_size && n_subbufs)) return NULL; @@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename, chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); chan->parent = parent; chan->private_data = private_data; - strlcpy(chan->base_filename, base_filename, NAME_MAX); + if (base_filename) { + chan->has_base_filename = 1; + strlcpy(chan->base_filename, base_filename, NAME_MAX); + } setup_callbacks(chan, cb); kref_init(&chan->kref); @@ -604,6 +623,94 @@ free_bufs: } EXPORT_SYMBOL_GPL(relay_open); +struct rchan_percpu_buf_dispatcher { + struct rchan_buf *buf; + struct dentry *dentry; +}; + +/* Called in atomic context. */ +static void __relay_set_buf_dentry(void *info) +{ + struct rchan_percpu_buf_dispatcher *p = info; + + relay_set_buf_dentry(p->buf, p->dentry); +} + +/** + * relay_late_setup_files - triggers file creation + * @chan: channel to operate on + * @base_filename: base name of files to create + * @parent: dentry of parent directory, %NULL for root directory + * + * Returns 0 if successful, non-zero otherwise. + * + * Use to setup files for a previously buffer-only channel. + * Useful to do early tracing in kernel, before VFS is up, for example. + */ +int relay_late_setup_files(struct rchan *chan, + const char *base_filename, + struct dentry *parent) +{ + int err = 0; + unsigned int i, curr_cpu; + unsigned long flags; + struct dentry *dentry; + struct rchan_percpu_buf_dispatcher disp; + + if (!chan || !base_filename) + return -EINVAL; + + strlcpy(chan->base_filename, base_filename, NAME_MAX); + + mutex_lock(&relay_channels_mutex); + /* Is chan already set up? */ + if (unlikely(chan->has_base_filename)) + return -EEXIST; + chan->has_base_filename = 1; + chan->parent = parent; + curr_cpu = get_cpu(); + /* + * The CPU hotplug notifier ran before us and created buffers with + * no files associated. So it's safe to call relay_setup_buf_file() + * on all currently online CPUs. + */ + for_each_online_cpu(i) { + if (unlikely(!chan->buf[i])) { + printk(KERN_ERR "relay_late_setup_files: CPU %u " + "has no buffer, it must have!\n", i); + BUG(); + err = -EINVAL; + break; + } + + dentry = relay_create_buf_file(chan, chan->buf[i], i); + if (unlikely(!dentry)) { + err = -EINVAL; + break; + } + + if (curr_cpu == i) { + local_irq_save(flags); + relay_set_buf_dentry(chan->buf[i], dentry); + local_irq_restore(flags); + } else { + disp.buf = chan->buf[i]; + disp.dentry = dentry; + smp_mb(); + /* relay_channels_mutex must be held, so wait. 
*/ + err = smp_call_function_single(i, + __relay_set_buf_dentry, + &disp, 1); + } + if (unlikely(err)) + break; + } + put_cpu(); + mutex_unlock(&relay_channels_mutex); + + return err; +} + /** * relay_switch_subbuf - switch to a new sub-buffer * @buf: channel buffer @@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; buf->padding[old_subbuf] = buf->prev_padding; buf->subbufs_produced++; - buf->dentry->d_inode->i_size += buf->chan->subbuf_size - - buf->padding[old_subbuf]; + if (buf->dentry) + buf->dentry->d_inode->i_size += + buf->chan->subbuf_size - + buf->padding[old_subbuf]; + else + buf->early_bytes += buf->chan->subbuf_size - + buf->padding[old_subbuf]; smp_mb(); if (waitqueue_active(&buf->read_wait)) /* @@ -1237,4 +1349,4 @@ static __init int relay_init(void) return 0; } -module_init(relay_init); +early_initcall(relay_init); -- cgit v1.1 From 51cc50685a4275c6a02653670af9f108a64e01cf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 19:45:34 -0700 Subject: SL*B: drop kmem cache argument from constructor Kmem cache passed to constructor is only needed for constructors that are themselves multiplexeres. Nobody uses this "feature", nor does anybody uses passed kmem cache in non-trivial way, so pass only pointer to object. Non-trivial places are: arch/powerpc/mm/init_64.c arch/powerpc/mm/hugetlbpage.c This is flag day, yes. Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Acked-by: Christoph Lameter Cc: Jon Tollefson Cc: Nick Piggin Cc: Matt Mackall [akpm@linux-foundation.org: fix arch/powerpc/mm/hugetlbpage.c] [akpm@linux-foundation.org: fix mm/slab.c] [akpm@linux-foundation.org: fix ubifs] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b99d73e..80e83e4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1442,7 +1442,7 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(struct kmem_cache *cachep, void *data) +static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; -- cgit v1.1 From b8c512f6190e313df69060bae4a161c5c044e272 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 19:45:36 -0700 Subject: Use WARN() in kernel/irq/manage.c Replace a printk+WARN_ON() by a WARN(); this increases the chance of the string making it into the bugreport (ie: it goes inside the ---[ cut here ]--- section) Signed-off-by: Arjan van de Ven Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f8914b9..152abfd 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -177,8 +177,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) { switch (desc->depth) { case 0: - printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); - WARN_ON(1); + WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { unsigned int status = desc->status & ~IRQ_DISABLED; -- cgit v1.1 From 261c40c1191ad8d7a2e49fa2bb5f6a84e3d44b10 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 19:45:37 -0700 Subject: use WARN() in kernel/irq/chip.c Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes 
part of the warning section for better reporting/collection. Signed-off-by: Arjan van de Ven Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/chip.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 964964b..3cd441e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; } @@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; } @@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); - printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", + WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", irq); - WARN_ON(1); return; } desc->msi_desc = NULL; @@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; } -- cgit v1.1 From ff1188646c6870f336e910fb894eeed74f50471f Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:45 -0700 Subject: tracehook: unexport ptrace_notify The ptrace_notify() function should not be called by any modules. It was only ever exported to be called by binfmt exec functions. But that is no longer necessary since fs/exec.c deals with that generically now. There should be no calls to ptrace_notify() from outside the core kernel. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 82c3545..8715c18 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1895,7 +1895,6 @@ EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); -- cgit v1.1 From 30199f5a46aee204bf437a4f5b0740f3efe448b7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:46 -0700 Subject: tracehook: exit This moves the PTRACE_EVENT_EXIT tracing into a tracehook.h inline, tracehook_report_exec(). The change has no effect, just clean-up. 
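To sketch what such a tracehook inline buys (illustrative names only, not the real tracehook.h definitions): the call site simply reports the exit code, while the inline hides the tracer-attached check and the notification details.

    #include <stdio.h>

    /* Hypothetical, simplified stand-ins for the real ptrace state. */
    struct task {
            int traced;          /* is a tracer attached and interested in exit? */
            long trace_message;  /* value handed to the tracer */
    };

    static void notify_tracer(struct task *t, const char *event)
    {
            printf("notify tracer: %s, message %ld\n", event, t->trace_message);
    }

    /* The inline hook: the caller just reports the exit code; whether and how
     * a tracer is told is entirely the hook's business. */
    static inline void report_exit(struct task *t, long *code)
    {
            if (t->traced) {
                    t->trace_message = *code;
                    notify_tracer(t, "exit");
            }
    }

    int main(void)
    {
            struct task t = { .traced = 1 };
            long code = 0;

            report_exit(&t, &code);   /* the only line an exit path needs */
            return 0;
    }

The benefit is purely organizational, which is exactly the "no effect, just clean-up" point made above.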
Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ad933bb..c3691cb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -1029,10 +1030,7 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(current->ptrace & PT_TRACE_EXIT)) { - current->ptrace_message = code; - ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); - } + tracehook_report_exit(&code); /* * We're taking recursive faults here in do_exit. Safest is to just -- cgit v1.1 From 09a05394fe2448a4139b014936330af23fa7ec83 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:47 -0700 Subject: tracehook: clone This moves all the ptrace initialization and tracing logic for task creation into tracehook.h and ptrace.h inlines. It reorganizes the code slightly, but should not change any behavior. There are four tracehook entry points, at each important stage of task creation. This keeps the interface from the core fork.c code fairly clean, while supporting the complex setup required for ptrace or something like it. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 69 ++++++++++++++++++++++++----------------------------------- 1 file changed, 28 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 80e83e4..b42f8ed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -865,8 +866,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~PF_SUPERPRIV; new_flags |= PF_FORKNOEXEC; - if (!(clone_flags & CLONE_PTRACE)) - p->ptrace = 0; + new_flags |= PF_STARTING; p->flags = new_flags; clear_freeze_flag(p); } @@ -907,7 +907,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, - struct pid *pid) + struct pid *pid, + int trace) { int retval; struct task_struct *p; @@ -1163,8 +1164,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_LIST_HEAD(&p->ptrace_entry); - INIT_LIST_HEAD(&p->ptraced); /* Now that the task is set up, run cgroup callbacks if * necessary. 
We need to run them before the task is visible @@ -1195,7 +1194,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_parent = current->real_parent; else p->real_parent = current; - p->parent = p->real_parent; spin_lock(¤t->sighand->siglock); @@ -1237,8 +1235,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (likely(p->pid)) { list_add_tail(&p->sibling, &p->real_parent->children); - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); + tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { if (clone_flags & CLONE_NEWPID) @@ -1323,29 +1320,13 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid); + &init_struct_pid, 0); if (!IS_ERR(task)) init_idle(task, cpu); return task; } -static int fork_traceflag(unsigned clone_flags) -{ - if (clone_flags & CLONE_UNTRACED) - return 0; - else if (clone_flags & CLONE_VFORK) { - if (current->ptrace & PT_TRACE_VFORK) - return PTRACE_EVENT_VFORK; - } else if ((clone_flags & CSIGNAL) != SIGCHLD) { - if (current->ptrace & PT_TRACE_CLONE) - return PTRACE_EVENT_CLONE; - } else if (current->ptrace & PT_TRACE_FORK) - return PTRACE_EVENT_FORK; - - return 0; -} - /* * Ok, this is the main fork-routine. * @@ -1380,14 +1361,14 @@ long do_fork(unsigned long clone_flags, } } - if (unlikely(current->ptrace)) { - trace = fork_traceflag (clone_flags); - if (trace) - clone_flags |= CLONE_PTRACE; - } + /* + * When called from kernel_thread, don't do user tracing stuff. + */ + if (likely(user_mode(regs))) + trace = tracehook_prepare_clone(clone_flags); p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL); + child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1405,24 +1386,30 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + tracehook_report_clone(trace, regs, clone_flags, nr, p); + + /* + * We set PF_STARTING at creation in case tracing wants to + * use this to distinguish a fully live task from one that + * hasn't gotten to tracehook_report_clone() yet. Now we + * clear it and set the child going. + */ + p->flags &= ~PF_STARTING; + + if (unlikely(clone_flags & CLONE_STOPPED)) { /* * We'll start up with an immediate SIGSTOP. */ sigaddset(&p->pending.signal, SIGSTOP); set_tsk_thread_flag(p, TIF_SIGPENDING); - } - - if (!(clone_flags & CLONE_STOPPED)) - wake_up_new_task(p, clone_flags); - else __set_task_state(p, TASK_STOPPED); - - if (unlikely (trace)) { - current->ptrace_message = nr; - ptrace_notify ((trace << 8) | SIGTRAP); + } else { + wake_up_new_task(p, clone_flags); } + tracehook_report_clone_complete(trace, regs, + clone_flags, nr, p); + if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); -- cgit v1.1 From daded34be96b1975ff8539ff62ad8b158ce7d842 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:47 -0700 Subject: tracehook: vfork-done This moves the PTRACE_EVENT_VFORK_DONE tracing into a tracehook.h inline, tracehook_report_vfork_done(). The change has no effect, just clean-up. 
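The vfork-done report sits right after the parent's wait on the child, which is a classic completion handshake. A hedged user-space analogue using a POSIX semaphore - hypothetical naming, standing in for the kernel's struct completion and wait_for_completion() - looks like this:

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    /* Hypothetical analogue of the vfork handshake: the parent blocks on a
     * completion until the child signals that it is done with the shared
     * state, and only then reports "vfork done". */
    static sem_t vfork_done;

    static void *child(void *arg)
    {
            (void)arg;
            printf("child: using parent's address space\n");
            sem_post(&vfork_done);          /* analogue of complete(&vfork) */
            return NULL;
    }

    int main(void)
    {
            pthread_t tid;

            sem_init(&vfork_done, 0, 0);
            pthread_create(&tid, NULL, child, NULL);

            sem_wait(&vfork_done);          /* analogue of wait_for_completion(&vfork) */
            printf("parent: vfork done, safe to report to a tracer\n");

            pthread_join(tid, NULL);
            return 0;
    }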
Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b42f8ed..abb3ed6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1414,10 +1414,7 @@ long do_fork(unsigned long clone_flags, freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { - current->ptrace_message = nr; - ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); - } + tracehook_report_vfork_done(p, nr); } } else { nr = PTR_ERR(p); -- cgit v1.1 From dae33574dcf5211e1f43c7e45fa29f73ba3e00cb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:48 -0700 Subject: tracehook: release_task This moves the ptrace-related logic from release_task into tracehook.h and ptrace.h inlines. It provides clean hooks both before and after locking tasklist_lock, for future tracing logic to do more cleanup without the lock. This also changes release_task() itself in the rare "zap_leader" case to set the leader to EXIT_DEAD before iterating. This maintains the invariant that release_task() only ever handles a task in EXIT_DEAD. This is a common-sense invariant that is already always true except in this one arcane case of zombie leader whose parent ignores SIGCHLD. This change is harmless and only costs one store in this one rare case. It keeps the expected state more consisently sane, which is nicer when debugging weirdness in release_task(). It also lets some future code in the tracehook entry points rely on this invariant for bookkeeping. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c3691cb..da28745 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -163,27 +163,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(container_of(rhp, struct task_struct, rcu)); } -/* - * Do final ptrace-related cleanup of a zombie being reaped. - * - * Called with write_lock(&tasklist_lock) held. - */ -static void ptrace_release_task(struct task_struct *p) -{ - BUG_ON(!list_empty(&p->ptraced)); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_entry)); -} void release_task(struct task_struct * p) { struct task_struct *leader; int zap_leader; repeat: + tracehook_prepare_release_task(p); atomic_dec(&p->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_release_task(p); + tracehook_finish_release_task(p); __exit_signal(p); /* @@ -205,6 +195,13 @@ repeat: * that case. */ zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); -- cgit v1.1 From 35de254dc60f91004b3b5ebb1fc7b2c3093d6032 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:51 -0700 Subject: tracehook: tracehook_consider_ignored_signal This defines tracehook_consider_ignored_signal() has a fine-grained hook for deciding to prevent the normal short-circuit of sending an ignored signal, as ptrace does. There is no change, only cleanup. 
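In the ptrace-backed world the hook simply answers "is a tracer attached and therefore interested in this otherwise-ignored signal?"; a sketch, assuming the (task, sig, handler) arguments used at the sig_ignored() call site below (illustrative, not necessarily the exact tracehook.h text):

        static inline int tracehook_consider_ignored_signal(struct task_struct *task,
                                                            int sig,
                                                            void __user *handler)
        {
                /* Nonzero means: do not short-circuit, queue the signal anyway. */
                return (task->ptrace & PT_PTRACED) != 0;
        }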
Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8715c18..9efd1ce 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -39,24 +40,21 @@ static struct kmem_cache *sigqueue_cachep; -static int __sig_ignored(struct task_struct *t, int sig) +static void __user *sig_handler(struct task_struct *t, int sig) { - void __user *handler; + return t->sighand->action[sig - 1].sa.sa_handler; +} +static int sig_handler_ignored(void __user *handler, int sig) +{ /* Is it explicitly or implicitly ignored? */ - - handler = t->sighand->action[sig - 1].sa.sa_handler; return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig)); } static int sig_ignored(struct task_struct *t, int sig) { - /* - * Tracers always want to know about signals.. - */ - if (t->ptrace & PT_PTRACED) - return 0; + void __user *handler; /* * Blocked signals are never ignored, since the @@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - return __sig_ignored(t, sig); + handler = sig_handler(t, sig); + if (!sig_handler_ignored(handler, sig)) + return 0; + + /* + * Tracers may want to know about even ignored signals. + */ + return !tracehook_consider_ignored_signal(t, sig, handler); } /* @@ -2298,7 +2303,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ - if (__sig_ignored(t, sig)) { + if (sig_handler_ignored(sig_handler(t, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); rm_from_queue_full(&mask, &t->signal->shared_pending); -- cgit v1.1 From 445a91d2fe3667fb8fc251433645f686933cf56a Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:52 -0700 Subject: tracehook: tracehook_consider_fatal_signal This defines tracehook_consider_fatal_signal() has a fine-grained hook for deciding to skip the special cases for a fatal signal, as ptrace does. There is no change, only cleanup. 
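As with the previous hook, a ptrace-backed sketch is a one-liner, assuming the (task, sig, handler) signature seen at the call sites below (illustrative only):

        static inline int tracehook_consider_fatal_signal(struct task_struct *task,
                                                          int sig,
                                                          void __user *handler)
        {
                /* Nonzero means: a tracer is attached, keep the special-case paths. */
                return (task->ptrace & PT_PTRACED) != 0;
        }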
Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9efd1ce..1a942ce 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -300,12 +300,12 @@ flush_signal_handlers(struct task_struct *t, int force_default) int unhandled_signal(struct task_struct *tsk, int sig) { + void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) return 1; - if (tsk->ptrace & PT_PTRACED) + if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || - (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); + return !tracehook_consider_fatal_signal(tsk, sig, handler); } @@ -761,7 +761,8 @@ static void complete_signal(int sig, struct task_struct *p, int group) if (sig_fatal(p, sig) && !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { + (sig == SIGKILL || + !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { /* * This signal will be fatal to the whole group. */ -- cgit v1.1 From 7bcf6a2ca5f639b038c48711ebe6c4eca2036641 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:53 -0700 Subject: tracehook: get_signal_to_deliver This defines the tracehook_get_signal() hook to allow tracing code to slip in before normal signal dequeuing. This lays the groundwork for new tracing features that can inject synthetic signals outside the normal queue or control the disposition of delivered signals. The calling convention lets tracehook_get_signal() decide both exactly what will happen and what signal number to report in the handler/exit. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1a942ce..10b31ec 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1754,17 +1754,33 @@ relock: do_signal_stop(0)) goto relock; - signr = dequeue_signal(current, ¤t->blocked, info); - if (!signr) - break; /* will return 0 */ + /* + * Tracing can induce an artifical signal and choose sigaction. + * The return value in @signr determines the default action, + * but @info->si_signo is the signal number we will report. + */ + signr = tracehook_get_signal(current, regs, info, return_ka); + if (unlikely(signr < 0)) + goto relock; + if (unlikely(signr != 0)) + ka = return_ka; + else { + signr = dequeue_signal(current, ¤t->blocked, + info); - if (signr != SIGKILL) { - signr = ptrace_signal(signr, info, regs, cookie); if (!signr) - continue; + break; /* will return 0 */ + + if (signr != SIGKILL) { + signr = ptrace_signal(signr, info, + regs, cookie); + if (!signr) + continue; + } + + ka = &sighand->action[signr-1]; } - ka = &sighand->action[signr-1]; if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { @@ -1812,7 +1828,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(signr))) { + if (likely(do_signal_stop(info->si_signo))) { /* It released the siglock. 
*/ goto relock; } @@ -1833,7 +1849,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(regs, signr); + print_fatal_signal(regs, info->si_signo); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with @@ -1842,13 +1858,13 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump((long)signr, signr, regs); + do_coredump(info->si_signo, info->si_signo, regs); } /* * Death signals, no core dump. */ - do_group_exit(signr); + do_group_exit(info->si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); -- cgit v1.1 From fa00b80b3c41a845b3d56f866fb40a2e98754c51 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:54 -0700 Subject: tracehook: job control This defines the tracehook_notify_jctl() hook to formalize the ptrace effects on the job control notifications. There is no change, only cleanup. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 10b31ec..e9e699f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -596,9 +596,6 @@ static int check_kill_permission(int sig, struct siginfo *info, return security_task_kill(t, info, sig, 0); } -/* forward decl */ -static void do_notify_parent_cldstop(struct task_struct *tsk, int why); - /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation @@ -1605,7 +1602,7 @@ finish_stop(int stop_count) * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { + if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(current, CLD_STOPPED); read_unlock(&tasklist_lock); @@ -1741,6 +1738,9 @@ relock: signal->flags &= ~SIGNAL_CLD_MASK; spin_unlock_irq(&sighand->siglock); + if (unlikely(!tracehook_notify_jctl(1, why))) + goto relock; + read_lock(&tasklist_lock); do_notify_parent_cldstop(current->group_leader, why); read_unlock(&tasklist_lock); @@ -1906,7 +1906,7 @@ void exit_signals(struct task_struct *tsk) out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop)) { + if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, CLD_STOPPED); read_unlock(&tasklist_lock); -- cgit v1.1 From 2b2a1ff64afbadac842bbc58c5166962cf4f7664 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:54 -0700 Subject: tracehook: death This moves the ptrace logic in task death (exit_notify) into tracehook.h inlines. Some code is rearranged slightly to make things nicer. There is no change, only cleanup. There is one hook called with the tasklist_lock write-locked, as ptrace needs. There is also a new hook called after exit_state changes and without locks. This is a better place for tracing work to be in the future, since it doesn't delay the whole system with locking. 
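The return-value convention can be read off the exit_notify() hunk below: a positive value is the signal to hand to do_notify_parent(), zero means notify nobody but leave a zombie, and a negative value means nobody will wait and the task can go straight to EXIT_DEAD. A sketch of a ptrace-backed implementation consistent with the logic being removed (illustrative; the real tracehook.h body and its named constants may differ):

        static inline int tracehook_notify_death(struct task_struct *task,
                                                 void **death_cookie, int group_dead)
        {
                *death_cookie = NULL;
                if (task_detached(task))                /* exit_signal == -1 */
                        return task->ptrace ? SIGCHLD : -1;     /* self-reap unless traced */
                if (thread_group_empty(task))
                        return ptrace_reparented(task) ? SIGCHLD : task->exit_signal;
                return task->ptrace ? SIGCHLD : 0;      /* zombie; notify the tracer only */
        }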
Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 26 +++++++++----------------- kernel/signal.c | 10 +++++++--- 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index da28745..6cdf607 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -885,7 +885,8 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int state; + int signal; + void *cookie; /* * This does two things: @@ -922,22 +923,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead) !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; - /* If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal - * only has special meaning to our real parent. - */ - if (!task_detached(tsk) && thread_group_empty(tsk)) { - int signal = ptrace_reparented(tsk) ? - SIGCHLD : tsk->exit_signal; - do_notify_parent(tsk, signal); - } else if (tsk->ptrace) { - do_notify_parent(tsk, SIGCHLD); - } + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal > 0) + signal = do_notify_parent(tsk, signal); - state = EXIT_ZOMBIE; - if (task_detached(tsk) && likely(!tsk->ptrace)) - state = EXIT_DEAD; - tsk->exit_state = state; + tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -947,8 +937,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) write_unlock_irq(&tasklist_lock); + tracehook_report_death(tsk, signal, cookie, group_dead); + /* If the process is dead, release it - nobody will wait for it */ - if (state == EXIT_DEAD) + if (signal < 0) release_task(tsk); } diff --git a/kernel/signal.c b/kernel/signal.c index e9e699f..0e862d3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1326,9 +1326,11 @@ static inline void __wake_up_parent(struct task_struct *p, /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. + * + * Returns -1 if our parent ignored us and so we've switched to + * self-reaping, or else @sig. */ - -void do_notify_parent(struct task_struct *tsk, int sig) +int do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; unsigned long flags; @@ -1399,12 +1401,14 @@ void do_notify_parent(struct task_struct *tsk, int sig) */ tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = 0; + sig = -1; } if (valid_signal(sig) && sig > 0) __group_send_sig_info(sig, &info, tsk->parent); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); + + return sig; } static void do_notify_parent_cldstop(struct task_struct *tsk, int why) -- cgit v1.1 From b787f7ba677840da16a2228c16571ce8a1fcb799 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:55 -0700 Subject: tracehook: force signal_pending() This defines a new hook tracehook_force_sigpending() that lets tracing code decide to force TIF_SIGPENDING on in recalc_sigpending(). This is not used yet, so it compiles away to nothing for now. It lays the groundwork for new tracing code that can interrupt a task synthetically without actually sending a signal. 
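Since the hook has no users yet, its definition is a stub the compiler folds away; a sketch, assumed to match the description above:

        static inline int tracehook_force_sigpending(void)
        {
                return 0;       /* no tracing code asks to force TIF_SIGPENDING yet */
        }

The recalc_sigpending() hunk below shows the intended use: any future nonzero return forces TIF_SIGPENDING on instead of letting it be cleared.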
Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0e862d3..954f77d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -134,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (unlikely(tracehook_force_sigpending())) + set_thread_flag(TIF_SIGPENDING); + else if (!recalc_sigpending_tsk(current) && !freezing(current)) clear_thread_flag(TIF_SIGPENDING); } -- cgit v1.1 From 85ba2d862e521375a8ee01526c5c46b1f24bb4af Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:58 -0700 Subject: tracehook: wait_task_inactive This extends wait_task_inactive() with a new argument so it can be used in a "soft" mode where it will check for the task changing state unexpectedly and back off. There is no change to existing callers. This lays the groundwork to allow robust, noninvasive tracing that can try to sample a blocked thread but back off safely if it wakes up. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 2 +- kernel/ptrace.c | 2 +- kernel/sched.c | 29 +++++++++++++++++++++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 6111c27..96cff2f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) return; } /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k); + wait_task_inactive(k, 0); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8392a9d..082b3fc 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) read_unlock(&tasklist_lock); if (!ret && !kill) - wait_task_inactive(child); + ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; /* All systems go.. */ return ret; diff --git a/kernel/sched.c b/kernel/sched.c index fde1a10..0236958 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * wait_task_inactive - wait for a thread to unschedule. * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. 
*/ -void wait_task_inactive(struct task_struct *p) +unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; int running, on_rq; + unsigned long ncsw; struct rq *rq; for (;;) { @@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p) * return false if the runqueue has changed and p * is actually now running somewhere else! */ - while (task_running(rq, p)) + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; cpu_relax(); + } /* * Ok, time to look more closely! We need the rq @@ -1910,9 +1921,21 @@ void wait_task_inactive(struct task_struct *p) rq = task_rq_lock(p, &flags); running = task_running(rq, p); on_rq = p->se.on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) { + ncsw = p->nivcsw + p->nvcsw; + if (unlikely(!ncsw)) + ncsw = 1; + } task_rq_unlock(rq, &flags); /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* * Was it really running after all now that we * checked with the proper locks actually held? * @@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p) */ break; } + + return ncsw; } /*** -- cgit v1.1 From 96930a6365c99c160138a395566e360b27348b8f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 19:46:21 -0700 Subject: make cgroup_seqfile_release() static cgroup_seqfile_release() can become static. Signed-off-by: Adrian Bunk Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66ec9fd..89bd6fb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1529,7 +1529,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return cft->read_seq_string(state->cgroup, cft, m); } -int cgroup_seqfile_release(struct inode *inode, struct file *file) +static int cgroup_seqfile_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); -- cgit v1.1 From 734550921e9b7ab924a43aa3d0bd4239dac4fbf1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Jul 2008 21:22:20 -0400 Subject: [PATCH] beginning of sysctl cleanup - ctl_table_set New object: set of sysctls [currently - root and per-net-ns]. Contains: pointer to parent set, list of tables and "should I see this set?" method (->is_seen(set)). Current lists of tables are subsumed by that; net-ns contains such a beast. ->lookup() for ctl_table_root returns pointer to ctl_table_set instead of that to ->list of that ctl_table_set. 
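Pieced together from this description and from setup_sysctl_set() in the diff below, the new object looks roughly like this (field names inferred; the real include/linux/sysctl.h declaration may carry additional members):

        struct ctl_table_set {
                struct list_head list;                  /* headers registered in this set */
                struct ctl_table_set *parent;           /* enclosing set, e.g. the root default_set */
                int (*is_seen)(struct ctl_table_set *); /* "should I see this set?" */
        };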
[folded compile fixes by rdd for configs without sysctl] Signed-off-by: Al Viro --- kernel/sysctl.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 35a50db..8ee4a06 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -160,12 +160,13 @@ static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, }; static struct ctl_table_root sysctl_table_root = { .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), + .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), }; static struct ctl_table kern_table[]; @@ -1403,14 +1404,20 @@ void sysctl_head_finish(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + static struct list_head * lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) { - struct list_head *header_list; - header_list = &root->header_list; - if (root->lookup) - header_list = root->lookup(root, namespaces); - return header_list; + struct ctl_table_set *set = lookup_header_set(root, namespaces); + return &set->list; } struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, @@ -1720,7 +1727,6 @@ struct ctl_table_header *__register_sysctl_paths( struct nsproxy *namespaces, const struct ctl_path *path, struct ctl_table *table) { - struct list_head *header_list; struct ctl_table_header *header; struct ctl_table *new, **prevp; unsigned int n, npath; @@ -1772,8 +1778,8 @@ struct ctl_table_header *__register_sysctl_paths( } #endif spin_lock(&sysctl_lock); - header_list = lookup_header_list(root, namespaces); - list_add_tail(&header->ctl_entry, header_list); + header->set = lookup_header_set(root, namespaces); + list_add_tail(&header->ctl_entry, &header->set->list); spin_unlock(&sysctl_lock); return header; @@ -1832,6 +1838,15 @@ void unregister_sysctl_table(struct ctl_table_header * header) kfree(header); } +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ + INIT_LIST_HEAD(&p->list); + p->parent = parent ? parent : &sysctl_table_root.default_set; + p->is_seen = is_seen; +} + #else /* !CONFIG_SYSCTL */ struct ctl_table_header *register_sysctl_table(struct ctl_table * table) { @@ -1848,6 +1863,12 @@ void unregister_sysctl_table(struct ctl_table_header * table) { } +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ +} + #endif /* CONFIG_SYSCTL */ /* -- cgit v1.1 From f7e6ced4061da509f737541ca4dbd44d83a6e82f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 01:44:23 -0400 Subject: [PATCH] allow delayed freeing of ctl_table_header Refcount the sucker; instead of freeing it by the end of unregistration just drop the refcount and free only when it hits zero. Make sure that we _always_ make ->unregistering non-NULL in start_unregistering(). 
That allows anybody to get a reference to such puppy, preventing its freeing and reuse. It does *not* block unregistration. Anybody who holds such a reference can * try to grab a "use" reference (ctl_head_grab()); that will succeeds if and only if it hadn't entered unregistration yet. If it succeeds, we can use it in all normal ways until we release the "use" reference (with ctl_head_finish()). Note that this relies on having ->unregistering become non-NULL in all cases when one starts to unregister the sucker. * keep pointers to ctl_table entries; they *can* be freed if the entire thing is unregistered. However, if ctl_head_grab() succeeds, we know that unregistration had not happened (and will not happen until ctl_head_finish()) and such pointers can be used safely. IOW, now we can have inodes under /proc/sys keep references to ctl_table entries, protecting them with references to ctl_table_header and grabbing the latter for the duration of operations that require access to ctl_table. That won't cause deadlocks, since unregistration will not be stopped by mere keeping a reference to ctl_table_header. Signed-off-by: Al Viro --- kernel/sysctl.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8ee4a06..60d9357 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1387,6 +1387,9 @@ static void start_unregistering(struct ctl_table_header *p) spin_unlock(&sysctl_lock); wait_for_completion(&wait); spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); } /* * do not remove from the list until nobody holds it; walking the @@ -1395,6 +1398,32 @@ static void start_unregistering(struct ctl_table_header *p) list_del_init(&p->ctl_entry); } +void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree(head); + spin_unlock(&sysctl_lock); +} + +struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + if (!head) + BUG(); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + void sysctl_head_finish(struct ctl_table_header *head) { if (!head) @@ -1771,6 +1800,7 @@ struct ctl_table_header *__register_sysctl_paths( header->unregistering = NULL; header->root = root; sysctl_set_parent(NULL, header->ctl_table); + header->count = 1; #ifdef CONFIG_SYSCTL_SYSCALL_CHECK if (sysctl_check_table(namespaces, header->ctl_table)) { kfree(header); @@ -1834,8 +1864,9 @@ void unregister_sysctl_table(struct ctl_table_header * header) spin_lock(&sysctl_lock); start_unregistering(header); + if (!--header->count) + kfree(header); spin_unlock(&sysctl_lock); - kfree(header); } void setup_sysctl_set(struct ctl_table_set *p, @@ -1869,6 +1900,10 @@ void setup_sysctl_set(struct ctl_table_set *p, { } +void sysctl_head_put(struct ctl_table_header *head) +{ +} + #endif /* CONFIG_SYSCTL */ /* -- cgit v1.1 From ae7edecc9b8810770a8e5cb9a466ea4bdcfa8401 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 06:33:31 -0400 Subject: [PATCH] sysctl: keep track of tree relationships In a sense, that's the heart of the series. 
It's based on the following property of the trees we are actually asked to add: they can be split into stem that is already covered by registered trees and crown that is entirely new. IOW, if a/b and a/c/d are introduced by our tree, then a/c is also introduced by it. That allows to associate tree and table entry with each node in the union; while directory nodes might be covered by many trees, only one will cover the node by its crown. And that will allow much saner logics for /proc/sys in the next patches. This patch introduces the data structures needed to keep track of that. When adding a sysctl table, we find a "parent" one. Which is to say, find the deepest node on its stem that already is present in one of the tables from our table set or its ancestor sets. That table will be our parent and that node in it - attachment point. Add our table to list anchored in parent, have it refer the parent and contents of attachment point. Also remember where its crown lives. Signed-off-by: Al Viro --- kernel/sysctl.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 60d9357..c9a0af8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1680,6 +1680,52 @@ static __init int sysctl_init(void) core_initcall(sysctl_init); +static int is_branch_in(struct ctl_table *branch, struct ctl_table *table) +{ + struct ctl_table *p; + const char *s = branch->procname; + + /* branch should have named subdirectory as its first element */ + if (!s || !branch->child) + return 0; + + /* ... and nothing else */ + if (branch[1].procname || branch[1].ctl_name) + return 0; + + /* table should contain subdirectory with the same name */ + for (p = table; p->procname || p->ctl_name; p++) { + if (!p->child) + continue; + if (p->procname && strcmp(p->procname, s) == 0) + return 1; + } + return 0; +} + +/* see if attaching q to p would be an improvement */ +static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) +{ + struct ctl_table *to = p->ctl_table, *by = q->ctl_table; + int is_better = 0; + int not_in_parent = !p->attached_by; + + while (is_branch_in(by, to)) { + if (by == q->attached_by) + is_better = 1; + if (to == p->attached_by) + not_in_parent = 1; + by = by->child; + to = to->child; + } + + if (is_better && not_in_parent) { + q->attached_by = by; + q->attached_to = to; + q->parent = p; + } +} + /** * __register_sysctl_paths - register a sysctl hierarchy * @root: List of sysctl headers to register on @@ -1759,6 +1805,7 @@ struct ctl_table_header *__register_sysctl_paths( struct ctl_table_header *header; struct ctl_table *new, **prevp; unsigned int n, npath; + struct ctl_table_set *set; /* Count the path components */ for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) @@ -1809,6 +1856,18 @@ struct ctl_table_header *__register_sysctl_paths( #endif spin_lock(&sysctl_lock); header->set = lookup_header_set(root, namespaces); + header->attached_by = header->ctl_table; + header->attached_to = root_table; + header->parent = &root_table_header; + for (set = header->set; set; set = set->parent) { + struct ctl_table_header *p; + list_for_each_entry(p, &set->list, ctl_entry) { + if (p->unregistering) + continue; + try_attach(p, header); + } + } + header->parent->count++; list_add_tail(&header->ctl_entry, &header->set->list); spin_unlock(&sysctl_lock); @@ -1864,6 +1923,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) spin_lock(&sysctl_lock); 
start_unregistering(header); + if (!--header->parent->count) { + WARN_ON(1); + kfree(header->parent); + } if (!--header->count) kfree(header); spin_unlock(&sysctl_lock); -- cgit v1.1 From 9043476f726802f4b00c96d0c4f418dde48d1304 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 08:54:06 -0400 Subject: [PATCH] sanitize proc_sysctl * keep references to ctl_table_head and ctl_table in /proc/sys inodes * grab the former during operations, use the latter for access to entry if that succeeds * have ->d_compare() check if table should be seen for one who does lookup; that allows us to avoid flipping inodes - if we have the same name resolve to different things, we'll just keep several dentries and ->d_compare() will reject the wrong ones. * have ->lookup() and ->readdir() scan the table of our inode first, then walk all ctl_table_header and scan ->attached_by for those that are attached to our directory. * implement ->getattr(). * get rid of insane amounts of tree-walking * get rid of the need to know dentry in ->permission() and of the contortions induced by that. Signed-off-by: Al Viro --- kernel/sysctl.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9a0af8..ff5abcc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1932,6 +1932,21 @@ void unregister_sysctl_table(struct ctl_table_header * header) spin_unlock(&sysctl_lock); } +int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + void setup_sysctl_set(struct ctl_table_set *p, struct ctl_table_set *parent, int (*is_seen)(struct ctl_table_set *)) -- cgit v1.1 From e6305c43eda10ebfd2ad9e35d6e172ccc7bb3695 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 21:03:57 -0400 Subject: [PATCH] sanitize ->permission() prototype * kill nameidata * argument; map the 3 bits in ->flags anybody cares about to new MAY_... ones and pass with the mask. * kill redundant gfs2_iop_permission() * sanitize ecryptfs_permission() * fix remaining places where ->permission() instances might barf on new MAY_... found in mask. 
The obvious next target in that direction is permission(9) folded fix for nfs_permission() breakage from Miklos Szeredi Signed-off-by: Al Viro --- kernel/sysctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff5abcc..911d846 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1516,9 +1516,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root, int op = 0, rc; if (oldval) - op |= 004; + op |= MAY_READ; if (newval) - op |= 002; + op |= MAY_WRITE; if (sysctl_perm(root, table, op)) return -EPERM; @@ -1560,7 +1560,7 @@ repeat: if (n == table->ctl_name) { int error; if (table->child) { - if (sysctl_perm(root, table, 001)) + if (sysctl_perm(root, table, MAY_EXEC)) return -EPERM; name++; nlen--; @@ -1635,7 +1635,7 @@ static int test_perm(int mode, int op) mode >>= 6; else if (in_egroup_p(0)) mode >>= 3; - if ((mode & op & 0007) == op) + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; return -EACCES; } @@ -1645,7 +1645,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) int error; int mode; - error = security_sysctl(table, op); + error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); if (error) return error; -- cgit v1.1 From 7f2da1e7d0330395e5e9e350b879b98a1ea495df Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 May 2008 20:44:54 -0400 Subject: [PATCH] kill altroot long overdue... Signed-off-by: Al Viro --- kernel/exec_domain.c | 1 - kernel/exit.c | 2 -- kernel/fork.c | 7 ------- 3 files changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c1ef192..0d407e8 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -168,7 +168,6 @@ __set_personality(u_long personality) current->personality = personality; oep = current_thread_info()->exec_domain; current_thread_info()->exec_domain = ep; - set_fs_altroot(); module_put(oep->module); return 0; diff --git a/kernel/exit.c b/kernel/exit.c index 6cdf607..0caf590 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -565,8 +565,6 @@ void put_fs_struct(struct fs_struct *fs) if (atomic_dec_and_test(&fs->count)) { path_put(&fs->root); path_put(&fs->pwd); - if (fs->altroot.dentry) - path_put(&fs->altroot); kmem_cache_free(fs_cachep, fs); } } diff --git a/kernel/fork.c b/kernel/fork.c index abb3ed6..5e050c1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -657,13 +657,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) path_get(&old->root); fs->pwd = old->pwd; path_get(&old->pwd); - if (old->altroot.dentry) { - fs->altroot = old->altroot; - path_get(&old->altroot); - } else { - fs->altroot.mnt = NULL; - fs->altroot.dentry = NULL; - } read_unlock(&old->lock); } return fs; -- cgit v1.1 From 3f8206d496e9e9495afb1d4e70d29712b4d403c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 03:46:43 -0400 Subject: [PATCH] get rid of indirect users of namei.h fs.h needs path.h, not namei.h; nfs_fs.h doesn't need it at all. Several places in the tree needed direct include. 
Signed-off-by: Al Viro --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 89bd6fb..657f8f8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -45,6 +45,7 @@ #include #include #include +#include #include -- cgit v1.1 From bfbcf034798b2ca45338cee5049b5694b7ddc865 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Jul 2008 06:31:22 +0100 Subject: lost sysctl fix try_attach() should walk into the matching subdirectory, not the first one... Signed-off-by: Al Viro Tested-by: Valdis.Kletnieks@vt.edu Tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 911d846..fe47133 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1680,43 +1680,45 @@ static __init int sysctl_init(void) core_initcall(sysctl_init); -static int is_branch_in(struct ctl_table *branch, struct ctl_table *table) +static struct ctl_table *is_branch_in(struct ctl_table *branch, + struct ctl_table *table) { struct ctl_table *p; const char *s = branch->procname; /* branch should have named subdirectory as its first element */ if (!s || !branch->child) - return 0; + return NULL; /* ... and nothing else */ if (branch[1].procname || branch[1].ctl_name) - return 0; + return NULL; /* table should contain subdirectory with the same name */ for (p = table; p->procname || p->ctl_name; p++) { if (!p->child) continue; if (p->procname && strcmp(p->procname, s) == 0) - return 1; + return p; } - return 0; + return NULL; } /* see if attaching q to p would be an improvement */ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) { struct ctl_table *to = p->ctl_table, *by = q->ctl_table; + struct ctl_table *next; int is_better = 0; int not_in_parent = !p->attached_by; - while (is_branch_in(by, to)) { + while ((next = is_branch_in(by, to)) != NULL) { if (by == q->attached_by) is_better = 1; if (to == p->attached_by) not_in_parent = 1; by = by->child; - to = to->child; + to = next->child; } if (is_better && not_in_parent) { -- cgit v1.1 From 605ccb73f6a1c891a16268b3a2923208fc637958 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 27 Jul 2008 13:39:03 +0200 Subject: tracing: remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the following warning with CONFIG_TRACING=y: kernel/trace/trace.c: In function ‘s_next’: kernel/trace/trace.c:1186: warning: unused variable ‘last_ent’ Signed-off-by: Andrea Righi Signed-off-by: Linus Torvalds --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fc20e09..8f3fb3d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1183,7 +1183,6 @@ static void *find_next_entry_inc(struct trace_iterator *iter) static void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; - void *last_ent = iter->ent; int i = (int)*pos; void *ent; -- cgit v1.1 From 5995477ab7f3522c497c9c4a1c55373e9d655574 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 27 Jul 2008 17:29:15 +0200 Subject: task IO accounting: improve code readability Put all i/o statistics in struct proc_io_accounting and use inline functions to initialize and increment statistics, removing a lot of single variable assignments. 
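The helpers themselves are trivial; a simplified sketch of the two used in this patch, written against the flattened struct task_io_accounting of the follow-up patch below and ignoring the CONFIG_TASK_XACCT / CONFIG_TASK_IO_ACCOUNTING split in the actual header:

        static inline void task_io_accounting_init(struct task_io_accounting *ioac)
        {
                memset(ioac, 0, sizeof(*ioac));
        }

        static inline void task_io_accounting_add(struct task_io_accounting *dst,
                                                  struct task_io_accounting *src)
        {
                dst->rchar += src->rchar;               /* bytes read */
                dst->wchar += src->wchar;               /* bytes written */
                dst->syscr += src->syscr;               /* read syscalls */
                dst->syscw += src->syscw;               /* write syscalls */
                dst->read_bytes += src->read_bytes;
                dst->write_bytes += src->write_bytes;
                dst->cancelled_write_bytes += src->cancelled_write_bytes;
        }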
This also reduces the kernel size as following (with CONFIG_TASK_XACCT=y and CONFIG_TASK_IO_ACCOUNTING=y). text data bss dec hex filename 11651 0 0 11651 2d83 kernel/exit.o.before 11619 0 0 11619 2d63 kernel/exit.o.after 10886 132 136 11154 2b92 kernel/fork.o.before 10758 132 136 11026 2b12 kernel/fork.o.after 3082029 807968 4818600 8708597 84e1f5 vmlinux.o.before 3081869 807968 4818600 8708437 84e155 vmlinux.o.after Signed-off-by: Andrea Righi Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 30 +++--------------------------- kernel/fork.c | 15 ++------------- kernel/tsacct.c | 14 +++++++------- 3 files changed, 12 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 0caf590..eb4d647 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -121,18 +121,7 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); -#ifdef CONFIG_TASK_XACCT - sig->rchar += tsk->rchar; - sig->wchar += tsk->wchar; - sig->syscr += tsk->syscr; - sig->syscw += tsk->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - sig->ioac.read_bytes += tsk->ioac.read_bytes; - sig->ioac.write_bytes += tsk->ioac.write_bytes; - sig->ioac.cancelled_write_bytes += - tsk->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1363,21 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; -#ifdef CONFIG_TASK_XACCT - psig->rchar += p->rchar + sig->rchar; - psig->wchar += p->wchar + sig->wchar; - psig->syscr += p->syscr + sig->syscr; - psig->syscw += p->syscw + sig->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - psig->ioac.read_bytes += - p->ioac.read_bytes + sig->ioac.read_bytes; - psig->ioac.write_bytes += - p->ioac.write_bytes + sig->ioac.write_bytes; - psig->ioac.cancelled_write_bytes += - p->ioac.cancelled_write_bytes + - sig->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 5e050c1..8214ba7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,12 +806,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; -#ifdef CONFIG_TASK_XACCT - sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; -#endif -#ifdef CONFIG_TASK_IO_ACCOUNTING - memset(&sig->ioac, 0, sizeof(sig->ioac)); -#endif + task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); @@ -994,13 +989,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->last_switch_timestamp = 0; #endif -#ifdef CONFIG_TASK_XACCT - p->rchar = 0; /* I/O counter: bytes read */ - p->wchar = 0; /* I/O counter: bytes written */ - p->syscr = 0; /* I/O counter: read syscalls */ - p->syscw = 0; /* I/O counter: write syscalls */ -#endif - task_io_accounting_init(p); + task_io_accounting_init(&p->ioac); 
acct_clear_integrals(p); p->it_virt_expires = cputime_zero; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 3da47cc..f9cd256 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->rchar; - stats->write_char = p->wchar; - stats->read_syscalls = p->syscr; - stats->write_syscalls = p->syscw; + stats->read_char = p->ioac.chr.rchar; + stats->write_char = p->ioac.chr.wchar; + stats->read_syscalls = p->ioac.chr.syscr; + stats->write_syscalls = p->ioac.chr.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.read_bytes; - stats->write_bytes = p->ioac.write_bytes; - stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; + stats->read_bytes = p->ioac.blk.read_bytes; + stats->write_bytes = p->ioac.blk.write_bytes; + stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit v1.1 From 940389b8afad6495211614c13eb91ef7001773ec Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 28 Jul 2008 00:48:12 +0200 Subject: task IO accounting: move all IO statistics in struct task_io_accounting Simplify the code of include/linux/task_io_accounting.h. It is also more reasonable to have all the task i/o-related statistics in a single struct (task_io_accounting). Signed-off-by: Andrea Righi Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f9cd256..8ebcd85 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->ioac.chr.rchar; - stats->write_char = p->ioac.chr.wchar; - stats->read_syscalls = p->ioac.chr.syscr; - stats->write_syscalls = p->ioac.chr.syscw; + stats->read_char = p->ioac.rchar; + stats->write_char = p->ioac.wchar; + stats->read_syscalls = p->ioac.syscr; + stats->write_syscalls = p->ioac.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.blk.read_bytes; - stats->write_bytes = p->ioac.blk.write_bytes; - stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes; + stats->read_bytes = p->ioac.read_bytes; + stats->write_bytes = p->ioac.write_bytes; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit v1.1 From 15bba37d62351749c3915add81f673b256952ee1 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 24 Jul 2008 15:41:48 +0100 Subject: module: fix build warning with !CONFIG_KALLSYMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixed the warning: CC kernel/module.o /home/wangcong/Projects/linux-2.6/kernel/module.c:332: warning: ‘lookup_symbol’ defined but not used Signed-off-by: WANG Cong Signed-off-by: Rusty Russell --- kernel/module.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d8b5605..d861bd5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -325,18 +325,6 @@ static unsigned long find_symbol(const char *name, return -ENOENT; } -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const 
char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; -} - /* Search for module by name: must hold module_mutex. */ static struct module *find_module(const char *name) { @@ -1703,6 +1691,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } #ifdef CONFIG_KALLSYMS + +/* lookup symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + const struct kernel_symbol *ks = start; + for (; ks < stop; ks++) + if (strcmp(ks->name, name) == 0) + return ks; + return NULL; +} + static int is_exported(const char *name, const struct module *mod) { if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) -- cgit v1.1 From 5c2aed622571ac7c3c6ec182d6d3c318e4b45c8b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 28 Feb 2008 11:33:03 -0500 Subject: stop_machine: add ALL_CPUS option -allow stop_mahcine_run() to call a function on all cpus. Calling stop_machine_run() with a 'ALL_CPUS' invokes this new behavior. stop_machine_run() proceeds as normal until the calling cpu has invoked 'fn'. Then, we tell all the other cpus to call 'fn'. Signed-off-by: Jason Baron Signed-off-by: Mathieu Desnoyers Signed-off-by: Rusty Russell CC: Adrian Bunk CC: Andi Kleen CC: Alexey Dobriyan CC: Christoph Hellwig CC: mingo@elte.hu CC: akpm@osdl.org --- kernel/stop_machine.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 738b411..a473bd0 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -22,9 +22,17 @@ enum stopmachine_state { STOPMACHINE_WAIT, STOPMACHINE_PREPARE, STOPMACHINE_DISABLE_IRQ, + STOPMACHINE_RUN, STOPMACHINE_EXIT, }; +struct stop_machine_data { + int (*fn)(void *); + void *data; + struct completion done; + int run_all; +} smdata; + static enum stopmachine_state stopmachine_state; static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; @@ -33,6 +41,7 @@ static int stopmachine(void *cpu) { int irqs_disabled = 0; int prepared = 0; + int ran = 0; cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); set_cpus_allowed_ptr(current, cpumask); @@ -58,6 +67,11 @@ static int stopmachine(void *cpu) prepared = 1; smp_mb(); /* Must read state first. */ atomic_inc(&stopmachine_thread_ack); + } else if (stopmachine_state == STOPMACHINE_RUN && !ran) { + smdata.fn(smdata.data); + ran = 1; + smp_mb(); /* Must read state first. */ + atomic_inc(&stopmachine_thread_ack); } /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. 
*/ @@ -136,11 +150,10 @@ static void restart_machine(void) preempt_enable_no_resched(); } -struct stop_machine_data { - int (*fn)(void *); - void *data; - struct completion done; -}; +static void run_other_cpus(void) +{ + stopmachine_set_state(STOPMACHINE_RUN); +} static int do_stop(void *_smdata) { @@ -150,6 +163,8 @@ static int do_stop(void *_smdata) ret = stop_machine(); if (ret == 0) { ret = smdata->fn(smdata->data); + if (smdata->run_all) + run_other_cpus(); restart_machine(); } @@ -173,14 +188,17 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, struct stop_machine_data smdata; struct task_struct *p; + mutex_lock(&stopmachine_mutex); + smdata.fn = fn; smdata.data = data; + smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0; init_completion(&smdata.done); - mutex_lock(&stopmachine_mutex); + smp_wmb(); /* make sure other cpus see smdata updates */ /* If they don't care which CPU fn runs on, bind to any online one. */ - if (cpu == NR_CPUS) + if (cpu == NR_CPUS || cpu == ALL_CPUS) cpu = raw_smp_processor_id(); p = kthread_create(do_stop, &smdata, "kstopmachine"); -- cgit v1.1 From ffdb5976c47609c862917d4c186ecbb5706d2dda Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:28 -0500 Subject: Simplify stop_machine stop_machine creates a kthread which creates kernel threads. We can create those threads directly and simplify things a little. Some care must be taken with CPU hotunplug, which has special needs, but that code seems more robust than it was in the past. Signed-off-by: Rusty Russell Acked-by: Christian Borntraeger --- kernel/cpu.c | 13 +-- kernel/stop_machine.c | 293 +++++++++++++++++++++----------------------------- 2 files changed, 128 insertions(+), 178 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1..cf79bb9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -216,7 +216,6 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - struct task_struct *p; cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; @@ -250,19 +249,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); - p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - if (IS_ERR(p) || cpu_online(cpu)) { + if (err || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) BUG(); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out_allowed; - } - goto out_thread; + goto out_allowed; } /* Wait for it to sleep (leaving idle task). */ @@ -279,8 +274,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_thread: - err = kthread_stop(p); out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a473bd0..35882dc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,4 +1,4 @@ -/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. +/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. */ #include @@ -13,220 +13,177 @@ #include #include -/* Since we effect priority and affinity (both of which are visible - * to, and settable by outside processes) we do indirection via a - * kthread. 
*/ - -/* Thread to stop each CPU in user context. */ +/* This controls the threads on each CPU. */ enum stopmachine_state { - STOPMACHINE_WAIT, + /* Dummy starting state for thread. */ + STOPMACHINE_NONE, + /* Awaiting everyone to be scheduled. */ STOPMACHINE_PREPARE, + /* Disable interrupts. */ STOPMACHINE_DISABLE_IRQ, + /* Run the function */ STOPMACHINE_RUN, + /* Exit */ STOPMACHINE_EXIT, }; +static enum stopmachine_state state; struct stop_machine_data { int (*fn)(void *); void *data; - struct completion done; - int run_all; -} smdata; + int fnret; +}; -static enum stopmachine_state stopmachine_state; -static unsigned int stopmachine_num_threads; -static atomic_t stopmachine_thread_ack; +/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ +static unsigned int num_threads; +static atomic_t thread_ack; +static struct completion finished; +static DEFINE_MUTEX(lock); -static int stopmachine(void *cpu) +static void set_state(enum stopmachine_state newstate) { - int irqs_disabled = 0; - int prepared = 0; - int ran = 0; - cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - - set_cpus_allowed_ptr(current, cpumask); - - /* Ack: we are alive */ - smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ - atomic_inc(&stopmachine_thread_ack); - - /* Simple state machine */ - while (stopmachine_state != STOPMACHINE_EXIT) { - if (stopmachine_state == STOPMACHINE_DISABLE_IRQ - && !irqs_disabled) { - local_irq_disable(); - hard_irq_disable(); - irqs_disabled = 1; - /* Ack: irqs disabled. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_PREPARE - && !prepared) { - /* Everyone is in place, hold CPU. */ - preempt_disable(); - prepared = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_RUN && !ran) { - smdata.fn(smdata.data); - ran = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } - /* Yield in first stage: migration threads need to - * help our sisters onto their CPUs. */ - if (!prepared && !irqs_disabled) - yield(); - cpu_relax(); - } - - /* Ack: we are exiting. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - - if (irqs_disabled) - local_irq_enable(); - if (prepared) - preempt_enable(); - - return 0; + /* Reset ack counter. */ + atomic_set(&thread_ack, num_threads); + smp_wmb(); + state = newstate; } -/* Change the thread state */ -static void stopmachine_set_state(enum stopmachine_state state) +/* Last one to ack a state moves to the next state. */ +static void ack_state(void) { - atomic_set(&stopmachine_thread_ack, 0); - smp_wmb(); - stopmachine_state = state; - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - cpu_relax(); + if (atomic_dec_and_test(&thread_ack)) { + /* If we're the last one to ack the EXIT, we're finished. */ + if (state == STOPMACHINE_EXIT) + complete(&finished); + else + set_state(state + 1); + } } -static int stop_machine(void) +/* This is the actual thread which stops the CPU. It exits by itself rather + * than waiting for kthread_stop(), because it's easier for hotplug CPU. 
*/ +static int stop_cpu(struct stop_machine_data *smdata) { - int i, ret = 0; - - atomic_set(&stopmachine_thread_ack, 0); - stopmachine_num_threads = 0; - stopmachine_state = STOPMACHINE_WAIT; + enum stopmachine_state curstate = STOPMACHINE_NONE; + int uninitialized_var(ret); - for_each_online_cpu(i) { - if (i == raw_smp_processor_id()) - continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) - break; - stopmachine_num_threads++; - } - - /* Wait for them all to come to life. */ - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { - yield(); + /* Simple state machine */ + do { + /* Chill out and ensure we re-read stopmachine_state. */ cpu_relax(); - } - - /* If some failed, kill them all. */ - if (ret < 0) { - stopmachine_set_state(STOPMACHINE_EXIT); - return ret; - } - - /* Now they are all started, make them hold the CPUs, ready. */ - preempt_disable(); - stopmachine_set_state(STOPMACHINE_PREPARE); - - /* Make them disable irqs. */ - local_irq_disable(); - hard_irq_disable(); - stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); - - return 0; -} + if (state != curstate) { + curstate = state; + switch (curstate) { + case STOPMACHINE_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case STOPMACHINE_RUN: + /* |= allows error detection if functions on + * multiple CPUs. */ + smdata->fnret |= smdata->fn(smdata->data); + break; + default: + break; + } + ack_state(); + } + } while (curstate != STOPMACHINE_EXIT); -static void restart_machine(void) -{ - stopmachine_set_state(STOPMACHINE_EXIT); local_irq_enable(); - preempt_enable_no_resched(); + do_exit(0); } -static void run_other_cpus(void) +/* Callback for CPUs which aren't supposed to do anything. */ +static int chill(void *unused) { - stopmachine_set_state(STOPMACHINE_RUN); + return 0; } -static int do_stop(void *_smdata) +int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct stop_machine_data *smdata = _smdata; - int ret; + int i, err; + struct stop_machine_data active, idle; + struct task_struct **threads; + + active.fn = fn; + active.data = data; + active.fnret = 0; + idle.fn = chill; + idle.data = NULL; + + /* If they don't care which cpu fn runs on, just pick one. */ + if (cpu == NR_CPUS) + cpu = any_online_cpu(cpu_online_map); + + /* This could be too big for stack on large machines. */ + threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); + if (!threads) + return -ENOMEM; + + /* Set up initial state. */ + mutex_lock(&lock); + init_completion(&finished); + num_threads = num_online_cpus(); + set_state(STOPMACHINE_PREPARE); - ret = stop_machine(); - if (ret == 0) { - ret = smdata->fn(smdata->data); - if (smdata->run_all) - run_other_cpus(); - restart_machine(); - } + for_each_online_cpu(i) { + struct stop_machine_data *smdata; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - /* We're done: you can kthread_stop us now */ - complete(&smdata->done); + if (cpu == ALL_CPUS || i == cpu) + smdata = &active; + else + smdata = &idle; + + threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", + i); + if (IS_ERR(threads[i])) { + err = PTR_ERR(threads[i]); + threads[i] = NULL; + goto kill_threads; + } - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return ret; -} + /* Place it onto correct cpu. 
*/ + kthread_bind(threads[i], i); -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) -{ - static DEFINE_MUTEX(stopmachine_mutex); - struct stop_machine_data smdata; - struct task_struct *p; + /* Make it highest prio. */ + if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, ¶m)) + BUG(); + } - mutex_lock(&stopmachine_mutex); + /* We've created all the threads. Wake them all: hold this CPU so one + * doesn't hit this CPU until we're ready. */ + cpu = get_cpu(); + for_each_online_cpu(i) + wake_up_process(threads[i]); - smdata.fn = fn; - smdata.data = data; - smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0; - init_completion(&smdata.done); + /* This will release the thread on our CPU. */ + put_cpu(); + wait_for_completion(&finished); + mutex_unlock(&lock); - smp_wmb(); /* make sure other cpus see smdata updates */ + kfree(threads); - /* If they don't care which CPU fn runs on, bind to any online one. */ - if (cpu == NR_CPUS || cpu == ALL_CPUS) - cpu = raw_smp_processor_id(); + return active.fnret; - p = kthread_create(do_stop, &smdata, "kstopmachine"); - if (!IS_ERR(p)) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; +kill_threads: + for_each_online_cpu(i) + if (threads[i]) + kthread_stop(threads[i]); + mutex_unlock(&lock); - /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - wake_up_process(p); - wait_for_completion(&smdata.done); - } - mutex_unlock(&stopmachine_mutex); - return p; + kfree(threads); + return err; } int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct task_struct *p; int ret; /* No CPUs can come up or down during this. */ get_online_cpus(); - p = __stop_machine_run(fn, data, cpu); - if (!IS_ERR(p)) - ret = kthread_stop(p); - else - ret = PTR_ERR(p); + ret = __stop_machine_run(fn, data, cpu); put_online_cpus(); return ret; -- cgit v1.1 From 04321587584272f4e8b9818f319f40caf8eeee13 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:29 -0500 Subject: Hotplug CPU: don't check cpu_online after take_cpu_down Akinobu points out that if take_cpu_down() succeeds, the cpu must be offline. Remove the cpu_online() check, and put a BUG_ON(). Quoting Akinobu Mita: Actually the cpu_online() check was necessary before appling this stop_machine: simplify patch. With old __stop_machine_run(), __stop_machine_run() could succeed (return !IS_ERR(p) value) even if take_cpu_down() returned non-zero value. The return value of take_cpu_down() was obtained through kthread_stop().. Signed-off-by: Rusty Russell Cc: "Akinobu Mita" --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index cf79bb9..53cf508 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -250,8 +250,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) set_cpus_allowed_ptr(current, &tmp); err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - - if (err || cpu_online(cpu)) { + if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -259,6 +258,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_allowed; } + BUG_ON(cpu_online(cpu)); /* Wait for it to sleep (leaving idle task). 
*/ while (!idle_cpu(cpu)) -- cgit v1.1 From eeec4fad963490821348a331cca6102ae1c4a7a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine(): stop_machine_run() changed to use cpu mask Instead of a "cpu" arg with magic values NR_CPUS (any cpu) and ~0 (all cpus), pass a cpumask_t. Allow NULL for the common case (where we don't care which CPU the function is run on): temporary cpumask_t's are usually considered bad for stack space. This deprecates stop_machine_run, to be removed soon when all the callers are dead. Signed-off-by: Rusty Russell --- kernel/cpu.c | 3 ++- kernel/stop_machine.c | 27 +++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 53cf508..29510d68 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -248,8 +248,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpus_setall(tmp); cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); + tmp = cpumask_of_cpu(cpu); - err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine(take_cpu_down, &tcd_param, &tmp); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 35882dc..e446c7c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -100,7 +100,7 @@ static int chill(void *unused) return 0; } -int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int i, err; struct stop_machine_data active, idle; @@ -112,10 +112,6 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) idle.fn = chill; idle.data = NULL; - /* If they don't care which cpu fn runs on, just pick one. */ - if (cpu == NR_CPUS) - cpu = any_online_cpu(cpu_online_map); - /* This could be too big for stack on large machines. */ threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); if (!threads) @@ -128,13 +124,16 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) set_state(STOPMACHINE_PREPARE); for_each_online_cpu(i) { - struct stop_machine_data *smdata; + struct stop_machine_data *smdata = &idle; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - if (cpu == ALL_CPUS || i == cpu) - smdata = &active; - else - smdata = &idle; + if (!cpus) { + if (i == first_cpu(cpu_online_map)) + smdata = &active; + } else { + if (cpu_isset(i, *cpus)) + smdata = &active; + } threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", i); @@ -154,7 +153,7 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) /* We've created all the threads. Wake them all: hold this CPU so one * doesn't hit this CPU until we're ready. */ - cpu = get_cpu(); + get_cpu(); for_each_online_cpu(i) wake_up_process(threads[i]); @@ -177,15 +176,15 @@ kill_threads: return err; } -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int ret; /* No CPUs can come up or down during this. 
*/ get_online_cpus(); - ret = __stop_machine_run(fn, data, cpu); + ret = __stop_machine(fn, data, cpus); put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(stop_machine_run); +EXPORT_SYMBOL_GPL(stop_machine); -- cgit v1.1 From 9b1a4d38373a5581a4e01032a3ccdd94cd93477b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine: Wean existing callers off stop_machine_run() Signed-off-by: Rusty Russell --- kernel/module.c | 8 ++++---- kernel/rcuclassic.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d861bd5..61d2121 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -678,7 +678,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced) if (flags & O_NONBLOCK) { struct stopref sref = { mod, flags, forced }; - return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + return stop_machine(__try_stop_module, &sref, NULL); } else { /* We don't need to stop the machine for this. */ mod->state = MODULE_STATE_GOING; @@ -1416,7 +1416,7 @@ static int __unlink_module(void *_mod) static void free_module(struct module *mod) { /* Delete from various lists */ - stop_machine_run(__unlink_module, mod, NR_CPUS); + stop_machine(__unlink_module, mod, NULL); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); @@ -2197,7 +2197,7 @@ static struct module *load_module(void __user *umod, /* Now sew it into the lists so we can get lockdep and oops * info during argument parsing. Noone should access us, since * strong_try_module_get() will fail. */ - stop_machine_run(__link_module, mod, NR_CPUS); + stop_machine(__link_module, mod, NULL); /* Size of section 0 is 0, so this works well if no params */ err = parse_args(mod->name, mod->args, @@ -2231,7 +2231,7 @@ static struct module *load_module(void __user *umod, return mod; unlink: - stop_machine_run(__unlink_module, mod, NR_CPUS); + stop_machine(__unlink_module, mod, NULL); module_arch_cleanup(mod); cleanup: kobject_del(&mod->mkobj.kobj); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 6f8696c..aad93cd 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -91,8 +91,8 @@ static void force_quiescent_state(struct rcu_data *rdp, * rdp->cpu is the current cpu. * * cpu_online_map is updated by the _cpu_down() - * using stop_machine_run(). Since we're in irqs disabled - * section, stop_machine_run() is not exectuting, hence + * using __stop_machine(). Since we're in irqs disabled + * section, __stop_machine() is not exectuting, hence * the cpu_online_map is stable. * * However, a cpu might have been offlined _just_ before -- cgit v1.1 From 784e2d76007f90d69341b95967160c4fb7829299 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:31 -0500 Subject: stop_machine: fix up ftrace.c Simple conversion. 
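For reference, the conversion recipe used across this series is roughly as follows (illustrative only; fn, data, cpu and ret stand for whatever the caller already has):

    /* old: "don't care which cpu fn runs on" */
    ret = stop_machine_run(fn, data, NR_CPUS);
    /* new: pass NULL for the cpumask */
    ret = stop_machine(fn, data, NULL);

    /* old: pin fn to one particular cpu (caller holds the hotplug lock) */
    ret = __stop_machine_run(fn, data, cpu);
    /* new: pass a one-cpu mask instead */
    {
            cpumask_t mask = cpumask_of_cpu(cpu);

            ret = __stop_machine(fn, data, &mask);
    }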
Signed-off-by: Rusty Russell Cc: Abhishek Sagar Cc: Ingo Molnar Cc: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4231a3d..f6e3af3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -587,7 +587,7 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); + stop_machine(__ftrace_modify_code, &command, NULL); } void ftrace_disable_daemon(void) @@ -787,7 +787,7 @@ static int ftrace_update_code(void) !ftrace_enabled || !ftraced_trigger) return 0; - stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + stop_machine(__ftrace_update_code, NULL, NULL); return 1; } @@ -1564,7 +1564,7 @@ static int __init ftrace_dynamic_init(void) addr = (unsigned long)ftrace_record_ip; - stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + stop_machine(ftrace_dyn_arch_init, &addr, NULL); /* ftrace_dyn_arch_init places the return code in addr */ if (addr) { -- cgit v1.1 From 0e241ffd306c0896bb9959be7faa4d4cfcb706d9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 24 Jul 2008 16:58:42 -0700 Subject: locking: fix mutex @key parameter kernel-doc notation Fix @key parameter to mutex_init() and one of its callers. Warning(linux-2.6.26-git11//drivers/base/class.c:210): No description found for parameter 'key' Signed-off-by: Randy Dunlap Acked-by: Greg Kroah-Hartman Signed-off-by: Ingo Molnar --- kernel/mutex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index bcdc9ac..12c779d 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -34,6 +34,7 @@ /*** * mutex_init - initialize the mutex * @lock: the mutex to be initialized + * @key: the lock_class_key for the class; used by mutex lock debugging * * Initialize the mutex to unlocked state. * -- cgit v1.1 From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Jul 2008 11:32:33 -0700 Subject: cpu masks: optimize and clean up cpumask_of_cpu() Clean up and optimize cpumask_of_cpu(), by sharing all the zero words. Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns creating a huge array of constant bitmasks, realize that the zero words can be shared. In other words, on a 64-bit architecture, we only ever need 64 of these arrays - with a different bit set in one single world (with enough zero words around it so that we can create any bitmask by just offsetting in that big array). And then we just put enough zeroes around it that we can point every single cpumask to be one of those things. So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each, with one bit set in each array - 2MB memory total), we have exactly 64 arrays instead, each 8k bits in size (64kB total). And then we just point cpumask(n) to the right position (which we can calculate dynamically). 
Once we have the right arrays, getting "cpumask(n)" ends up being: static inline const cpumask_t *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return (const cpumask_t *)p; } This brings other advantages and simplifications as well: - we are not wasting memory that is just filled with a single bit in various different places - we don't need all those games to re-create the arrays in some dense format, because they're already going to be dense enough. if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory is a non-issue (especially since by doing this "overlapping" trick we probably get better cache behaviour anyway). [ mingo@elte.hu: Converted Linus's mails into a commit. See: http://lkml.org/lkml/2008/7/27/156 http://lkml.org/lkml/2008/7/28/320 Also applied a family filter - which also has the side-effect of leaving out the bits where Linus calls me an idio... Oh, never mind ;-) ] Signed-off-by: Ingo Molnar Cc: Rusty Russell Cc: Andrew Morton Cc: Al Viro Cc: Mike Travis Signed-off-by: Ingo Molnar --- kernel/cpu.c | 128 ++++++++++------------------------------------------------- 1 file changed, 20 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a35d899..06a8358 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,115 +462,27 @@ out: #endif /* CONFIG_SMP */ -/* 64 bits of zeros, for initializers. */ -#if BITS_PER_LONG == 32 -#define Z64 0, 0 -#else -#define Z64 0 -#endif +/* + * cpu_bit_bitmap[] is a special, "compressed" data structure that + * represents all NR_CPUS bits binary values of 1< 4 - CMI0(4), CMI0(5), CMI0(6), CMI0(7), -#endif -#if NR_CPUS > 8 - CMI0(8), CMI0(9), CMI0(10), CMI0(11), - CMI0(12), CMI0(13), CMI0(14), CMI0(15), -#endif -#if NR_CPUS > 16 - CMI0(16), CMI0(17), CMI0(18), CMI0(19), - CMI0(20), CMI0(21), CMI0(22), CMI0(23), - CMI0(24), CMI0(25), CMI0(26), CMI0(27), - CMI0(28), CMI0(29), CMI0(30), CMI0(31), -#endif -#if NR_CPUS > 32 -#if BITS_PER_LONG == 32 - CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), - CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), - CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), - CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), - CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), - CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), - CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), - CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), -#else - CMI0(32), CMI0(33), CMI0(34), CMI0(35), - CMI0(36), CMI0(37), CMI0(38), CMI0(39), - CMI0(40), CMI0(41), CMI0(42), CMI0(43), - CMI0(44), CMI0(45), CMI0(46), CMI0(47), - CMI0(48), CMI0(49), CMI0(50), CMI0(51), - CMI0(52), CMI0(53), CMI0(54), CMI0(55), - CMI0(56), CMI0(57), CMI0(58), CMI0(59), - CMI0(60), CMI0(61), CMI0(62), CMI0(63), -#endif /* BITS_PER_LONG == 64 */ -#endif -#if NR_CPUS > 64 - CMI64(64, Z64), -#endif -#if NR_CPUS > 128 - CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), -#endif -#if NR_CPUS > 256 - CMI256(256, Z256), -#endif -#if NR_CPUS > 512 - CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), -#endif -#if NR_CPUS > 1024 - CMI1024(1024, Z1024), -#endif -#if NR_CPUS > 2048 - CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), -#endif -#if NR_CPUS > 4096 -#error NR_CPUS too big. 
Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { + + MASK_DECLARE_8(0), MASK_DECLARE_8(8), + MASK_DECLARE_8(16), MASK_DECLARE_8(24), +#if BITS_PER_LONG > 32 + MASK_DECLARE_8(32), MASK_DECLARE_8(40), + MASK_DECLARE_8(48), MASK_DECLARE_8(56), #endif }; - -const cpumask_t *cpumask_of_cpu_map = cpumask_map; - -EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); +EXPORT_SYMBOL_GPL(cpu_bit_bitmap); -- cgit v1.1 From cddb8a5c14aa89810b40495d94d3d2a0faee6619 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:29 -0700 Subject: mmu-notifiers: core With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages. There are secondary MMUs (with secondary sptes and secondary tlbs) too. sptes in the kvm case are shadow pagetables, but when I say spte in mmu-notifier context, I mean "secondary pte". In GRU case there's no actual secondary pte and there's only a secondary tlb because the GRU secondary MMU has no knowledge about sptes and every secondary tlb miss event in the MMU always generates a page fault that has to be resolved by the CPU (this is not the case of KVM where the a secondary tlb miss will walk sptes in hardware and it will refill the secondary tlb transparently to software if the corresponding spte is present). The same way zap_page_range has to invalidate the pte before freeing the page, the spte (and secondary tlb) must also be invalidated before any page is freed and reused. Currently we take a page_count pin on every page mapped by sptes, but that means the pages can't be swapped whenever they're mapped by any spte because they're part of the guest working set. Furthermore a spte unmap event can immediately lead to a page to be freed when the pin is released (so requiring the same complex and relatively slow tlb_gather smp safe logic we have in zap_page_range and that can be avoided completely if the spte unmap event doesn't require an unpin of the page previously mapped in the secondary MMU). The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know when the VM is swapping or freeing or doing anything on the primary MMU so that the secondary MMU code can drop sptes before the pages are freed, avoiding all page pinning and allowing 100% reliable swapping of guest physical address space. Furthermore it avoids the code that teardown the mappings of the secondary MMU, to implement a logic like tlb_gather in zap_page_range that would require many IPI to flush other cpu tlbs, for each fixed number of spte unmapped. To make an example: if what happens on the primary MMU is a protection downgrade (from writeable to wrprotect) the secondary MMU mappings will be invalidated, and the next secondary-mmu-page-fault will call get_user_pages and trigger a do_wp_page through get_user_pages if it called get_user_pages with write=1, and it'll re-establishing an updated spte or secondary-tlb-mapping on the copied page. Or it will setup a readonly spte or readonly tlb mapping if it's a guest-read, if it calls get_user_pages with write=0. This is just an example. 
This allows to map any page pointed by any pte (and in turn visible in the primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an full MMU with both sptes and secondary-tlb like the shadow-pagetable layer with kvm), or a remote DMA in software like XPMEM (hence needing of schedule in XPMEM code to send the invalidate to the remote node, while no need to schedule in kvm/gru as it's an immediate event like invalidating primary-mmu pte). At least for KVM without this patch it's impossible to swap guests reliably. And having this feature and removing the page pin allows several other optimizations that simplify life considerably. Dependencies: 1) mm_take_all_locks() to register the mmu notifier when the whole VM isn't doing anything with "mm". This allows mmu notifier users to keep track if the VM is in the middle of the invalidate_range_begin/end critical section with an atomic counter incraese in range_begin and decreased in range_end. No secondary MMU page fault is allowed to map any spte or secondary tlb reference, while the VM is in the middle of range_begin/end as any page returned by get_user_pages in that critical section could later immediately be freed without any further ->invalidate_page notification (invalidate_range_begin/end works on ranges and ->invalidate_page isn't called immediately before freeing the page). To stop all page freeing and pagetable overwrites the mmap_sem must be taken in write mode and all other anon_vma/i_mmap locks must be taken too. 2) It'd be a waste to add branches in the VM if nobody could possibly run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of mmu notifiers, but this already allows to compile a KVM external module against a kernel with mmu notifiers enabled and from the next pull from kvm.git we'll start using them. And GRU/XPMEM will also be able to continue the development by enabling KVM=m in their config, until they submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n). This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM are all =n. The mmu_notifier_register call can fail because mm_take_all_locks may be interrupted by a signal and return -EINTR. Because mmu_notifier_reigster is used when a driver startup, a failure can be gracefully handled. Here an example of the change applied to kvm to register the mmu notifiers. Usually when a driver startups other allocations are required anyway and -ENOMEM failure paths exists already. struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + int err; if (!kvm) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops; + err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm); + if (err) { + kfree(kvm); + return ERR_PTR(err); + } + return kvm; } mmu_notifier_unregister returns void and it's reliable. The patch also adds a few needed but missing includes that would prevent kernel to compile after these changes on non-x86 archs (x86 didn't need them by luck). 
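For a driver other than KVM the registration has the same shape; as a rough sketch (not part of this patch, the my_* names are invented), a GRU/XPMEM-style secondary MMU embeds a notifier and wires up only the callbacks it needs:

    #include <linux/mmu_notifier.h>

    static void my_invalidate_page(struct mmu_notifier *mn,
                                   struct mm_struct *mm,
                                   unsigned long address)
    {
            /* tear down the spte / secondary tlb entry covering 'address' */
    }

    static const struct mmu_notifier_ops my_mmu_notifier_ops = {
            .invalidate_page = my_invalidate_page,
            /* .invalidate_range_start/.invalidate_range_end and .release
             * would be filled in the same way when needed */
    };

    struct my_secondary_mmu {
            struct mmu_notifier mn;
            /* driver-private spte / secondary tlb state */
    };

    static int my_attach_to_mm(struct my_secondary_mmu *smmu, struct mm_struct *mm)
    {
            smmu->mn.ops = &my_mmu_notifier_ops;
            /* can fail with -EINTR (mm_take_all_locks interrupted), so the
             * driver handles the error at startup like any other allocation */
            return mmu_notifier_register(&smmu->mn, mm);
    }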
[akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix mm/filemap_xip.c build] [akpm@linux-foundation.org: fix mm/mmu_notifier.c build] Signed-off-by: Andrea Arcangeli Signed-off-by: Nick Piggin Signed-off-by: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8214ba7..7ce2ebe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -414,6 +415,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; + mmu_notifier_mm_init(mm); return mm; } @@ -446,6 +448,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + mmu_notifier_mm_destroy(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); -- cgit v1.1 From 1a4e564b7db999fbe5d88318c96ac8747699d417 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 29 Jul 2008 22:32:57 -0700 Subject: resource: add resource_size() Avoid one-off errors by introducing a resource_size() function. Signed-off-by: Magnus Damm Cc: Ben Dooks Cc: Jean Delvare Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 74af2d7..f5b518e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -490,7 +490,7 @@ resource_size_t resource_alignment(struct resource *res) { switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { case IORESOURCE_SIZEALIGN: - return res->end - res->start + 1; + return resource_size(res); case IORESOURCE_STARTALIGN: return res->start; default: -- cgit v1.1 From 5a3eb9f6b7c598529f832b8baa6458ab1cbab2c6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:18 -0700 Subject: cgroup: fix possible memory leak There's a leak if copy_from_user() returns failure. 
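The fix below restores the usual goto-unwind shape; in generic form (illustrative only -- write_copy(), userbuf and nbytes are placeholders, not the actual cgroup_write_string()):

    static ssize_t write_copy(const char __user *userbuf, size_t nbytes)
    {
            ssize_t retval = nbytes;
            char *buffer = kmalloc(nbytes + 1, GFP_KERNEL);

            if (!buffer)
                    return -ENOMEM;
            if (copy_from_user(buffer, userbuf, nbytes)) {
                    retval = -EFAULT;
                    goto out;       /* an early return here would leak 'buffer' */
            }
            buffer[nbytes] = '\0';
            /* ... consume buffer ... */
    out:
            kfree(buffer);
            return retval;
    }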
Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 657f8f8..28debe4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1424,14 +1424,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, if (buffer == NULL) return -ENOMEM; } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { + retval = -EFAULT; + goto out; + } buffer[nbytes] = 0; /* nul-terminate */ strstrip(buffer); retval = cft->write_string(cgrp, cft, buffer); if (!retval) retval = nbytes; +out: if (buffer != local_buffer) kfree(buffer); return retval; -- cgit v1.1 From 36553434f475a84b653e25e74490ee8df43b86d5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:19 -0700 Subject: cgroup: remove duplicate code in allocate_cg_link() - just call free_cg_links() in allocate_cg_links() - the list will get initialized in allocate_cg_links(), so don't init it twice Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 28debe4..249a517 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set( return NULL; } +static void free_cg_links(struct list_head *tmp) +{ + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { + list_del(&link->cgrp_link_list); + kfree(link); + } +} + /* * allocate_cg_links() allocates "count" cg_cgroup_link structures * and chains them on tmp through their cgrp_link_list fields. 
Returns 0 on @@ -363,17 +374,12 @@ static struct css_set *find_existing_css_set( static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; int i; INIT_LIST_HEAD(tmp); for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - list_for_each_entry_safe(link, saved_link, tmp, - cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } + free_cg_links(tmp); return -ENOMEM; } list_add(&link->cgrp_link_list, tmp); @@ -381,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } -static void free_cg_links(struct list_head *tmp) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } -} - /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -956,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *root; struct list_head tmp_cg_links; - INIT_LIST_HEAD(&tmp_cg_links); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); -- cgit v1.1 From 55b6fd0162ace1e0f1b52c8c092565c115127ef6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:20 -0700 Subject: cgroup: uninline cgroup_has_css_refs() It's not small enough, and has 2 call sites. text data bss dec hex filename 12813 1676 4832 19321 4b79 cgroup.o.orig 12775 1676 4832 19283 4b53 cgroup.o Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 249a517..13932abd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2368,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static inline int cgroup_has_css_refs(struct cgroup *cgrp) +static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. 
Since we * already established that there are no tasks in the -- cgit v1.1 From 8d1e6266f512b3a94ef6d33528ff385f1aea0392 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:21 -0700 Subject: cpuset: a bit cleanup for scan_for_empty_cpusets() clean up hierarchy traversal code Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Paul Jackson Cc: Cliff Wickman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 91cf85b..3624dc0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1833,24 +1833,21 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) */ static void scan_for_empty_cpusets(const struct cpuset *root) { + LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ - struct list_head queue; struct cgroup *cont; nodemask_t oldmems; - INIT_LIST_HEAD(&queue); - list_add_tail((struct list_head *)&root->stack_list, &queue); while (!list_empty(&queue)) { - cp = container_of(queue.next, struct cpuset, stack_list); + cp = list_first_entry(&queue, struct cpuset, stack_list); list_del(queue.next); list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); list_add_tail(&child->stack_list, &queue); } - cont = cp->css.cgroup; /* Continue past cpusets with all cpus, mems online */ if (cpus_subset(cp->cpus_allowed, cpu_online_map) && -- cgit v1.1 From f5393693e96393131a4a2e2743f883986d508503 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 29 Jul 2008 22:33:22 -0700 Subject: cpuset: speed up sched domain partition All child cpusets contain a subset of the parent's cpus, so we can skip them when partitioning sched domains. This decreases 'csa' greately for cpusets with multi-level hierarchy. 
Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Reviewed-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3624dc0..c818c36 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -486,13 +486,38 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) static void update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) { - if (!dattr) - return; if (dattr->relax_domain_level < c->relax_domain_level) dattr->relax_domain_level = c->relax_domain_level; return; } +static void +update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +{ + LIST_HEAD(q); + + list_add(&c->stack_list, &q); + while (!list_empty(&q)) { + struct cpuset *cp; + struct cgroup *cont; + struct cpuset *child; + + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + + if (cpus_empty(cp->cpus_allowed)) + continue; + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &q); + } + } +} + /* * rebuild_sched_domains() * @@ -614,8 +639,16 @@ void rebuild_sched_domains(void) if (cpus_empty(cp->cpus_allowed)) continue; - if (is_sched_load_balance(cp)) + /* + * All child cpusets contain a subset of the parent's cpus, so + * just skip them, and then we call update_domain_attr_tree() + * to calc relax_domain_level of the corresponding sched + * domain. + */ + if (is_sched_load_balance(cp)) { csa[csn++] = cp; + continue; + } list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); @@ -686,7 +719,7 @@ restart: cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; if (dattr) - update_domain_attr(dattr + update_domain_attr_tree(dattr + nslot, b); } } -- cgit v1.1 From 93a6557558a13f9ff35213efeca483f353c39dd3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:23 -0700 Subject: cpuset: fix wrong calculation of relax domain level When multiple cpusets are overlapping in their 'cpus' and hence they form a single sched domain, the largest sched_relax_domain_level among those should be used. But when top_cpuset's sched_load_balance is set, its sched_relax_domain_level is used regardless other sub-cpusets'. This patch fixes it by walking the cpuset hierarchy to find the largest sched_relax_domain_level. 
Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Reviewed-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c818c36..7a82e90 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -616,7 +616,7 @@ void rebuild_sched_domains(void) dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; - update_domain_attr(dattr, &top_cpuset); + update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; goto rebuild; -- cgit v1.1 From aeed682421a5ebfbf46940e30c3d1caf3bc64304 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:24 -0700 Subject: cpuset: clean up cpuset hierarchy traversal code Use cpuset.stack_list rather than kfifo, so we avoid memory allocation for kfifo. Signed-off-by: Li Zefan Signed-off-by: Lai Jiangshan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7a82e90..d5ab79c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include @@ -557,7 +556,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * So the reverse nesting would risk an ABBA deadlock. * * The three key local variables below are: - * q - a kfifo queue of cpuset pointers, used to implement a + * q - a linked-list queue of cpuset pointers, used to implement a * top-down scan of all cpusets. This scan loads a pointer * to each cpuset marked is_sched_load_balance into the * array 'csa'. 
For our purposes, rebuilding the schedulers @@ -592,7 +591,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) void rebuild_sched_domains(void) { - struct kfifo *q; /* queue of cpusets to be scanned */ + LIST_HEAD(q); /* queue of cpusets to be scanned*/ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -602,7 +601,6 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - q = NULL; csa = NULL; doms = NULL; dattr = NULL; @@ -622,20 +620,19 @@ void rebuild_sched_domains(void) goto rebuild; } - q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); - if (IS_ERR(q)) - goto done; csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; - cp = &top_cpuset; - __kfifo_put(q, (void *)&cp, sizeof(cp)); - while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { + list_add(&top_cpuset.stack_list, &q); + while (!list_empty(&q)) { struct cgroup *cont; struct cpuset *child; /* scans child cpusets of cp */ + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + if (cpus_empty(cp->cpus_allowed)) continue; @@ -652,7 +649,7 @@ void rebuild_sched_domains(void) list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); - __kfifo_put(q, (void *)&child, sizeof(cp)); + list_add_tail(&child->stack_list, &q); } } @@ -735,8 +732,6 @@ rebuild: put_online_cpus(); done: - if (q && !IS_ERR(q)) - kfifo_free(q); kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ /* Don't kfree(dattr) -- partition_sched_domains() does that. */ -- cgit v1.1 From 5def9a3a22e09c99717f41ab7f07ec9e1a1f3ec8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 29 Jul 2008 22:33:31 -0700 Subject: markers: fix markers read barrier for multiple probes Paul pointed out two incorrect read barriers in the marker handler code in the path where multiple probes are connected. Those are ordering reads of "ptype" (single or multi probe marker), "multi" array pointer, and "multi" array data access. It should be ordered like this : read ptype smp_rmb() read multi array pointer smp_read_barrier_depends() access data referenced by multi array pointer The code with a single probe connected (optimized case, does not have to allocate an array) has correct memory ordering. It applies to kernel 2.6.26.x, 2.6.25.x and linux-next. Signed-off-by: Mathieu Desnoyers Cc: "Paul E. McKenney" Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 971da53..7d1faec 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -126,6 +126,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) struct marker_probe_closure *multi; int i; /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, * we must insure that the pointer is read _before_ the array @@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. 
*/ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) { va_start(args, call_private); multi[i].func(multi[i].probe_private, call_private, @@ -175,6 +179,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) struct marker_probe_closure *multi; int i; /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, * we must insure that the pointer is read _before_ the array @@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); -- cgit v1.1 From 641de9d8f505db055d451b50e6e38117f84e79bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 29 Jul 2008 22:33:38 -0700 Subject: printk: fix comment for printk ratelimiting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment assumed the burst to be one and the ratelimit used to be named printk_ratelimit_jiffies. Signed-off-by: Uwe Kleine-König Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a7f7559..b51b156 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1309,14 +1309,14 @@ void tty_write_message(struct tty_struct *tty, char *msg) #if defined CONFIG_PRINTK -DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); /* * printk rate limiting, lifted from the networking subsystem. * - * This enforces a rate limit: not more than one kernel message - * every printk_ratelimit_jiffies to make a denial-of-service - * attack impossible. + * This enforces a rate limit: not more than 10 kernel messages + * every 5s to make a denial-of-service attack impossible. */ +DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); + int printk_ratelimit(void) { return __ratelimit(&printk_ratelimit_state); -- cgit v1.1 From 6af8bf3d86d55c98af6e453cb920ddc30867e5c7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Jul 2008 22:33:49 -0700 Subject: workqueues: add comments to __create_workqueue_key() Dmitry Adamushko pointed out that the error handling in __create_workqueue_key() is not clear, add the comment. Signed-off-by: Oleg Nesterov Cc: Dmitry Adamushko Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ec7e4f6..4a26a13 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -830,10 +830,21 @@ struct workqueue_struct *__create_workqueue_key(const char *name, start_workqueue_thread(cwq, -1); } else { cpu_maps_update_begin(); + /* + * We must place this wq on list even if the code below fails. + * cpu_down(cpu) can remove cpu from cpu_populated_map before + * destroy_workqueue() takes the lock, in that case we leak + * cwq[cpu]->thread. 
+ */ spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); spin_unlock(&workqueue_lock); - + /* + * We must initialize cwqs for each possible cpu even if we + * are going to call destroy_workqueue() finally. Otherwise + * cpu_up() can hit the uninitialized cwq once we drop the + * lock. + */ for_each_possible_cpu(cpu) { cwq = init_cpu_workqueue(wq, cpu); if (err || !cpu_online(cpu)) -- cgit v1.1 From f718cd4add5aea9d379faff92f162571e356cc5f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 29 Jul 2008 22:33:52 -0700 Subject: sched: make scheduler sysfs attributes sysdev class devices They are really class devices, but were incorrectly declared. This leads to crashes with the recent changes that makes non normal sysdevs use a different prototype. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andi Kleen Cc: Ingo Molnar Cc: Pierre Ossman Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0236958..21f7da9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7671,34 +7671,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } -static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, sched_smt_power_savings_store); #endif -- cgit v1.1 From a9b60bf4c29e07a5a2f26a6f74937972fee9b58b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 1 Aug 2008 08:39:34 -0500 Subject: kgdb: fix kgdb_validate_break_address to perform a mem write A regression to the kgdb core was found in the case of using the CONFIG_DEBUG_RODATA kernel option. When this option is on, a breakpoint cannot be written into any readonly memory page. When an external debugger requests a breakpoint to get set, the kgdb_validate_break_address() was only checking to see if the address to place the breakpoint was readable and lacked a write check. 
This patch changes the validate routine to try reading (via the breakpoint set request) and also to try immediately writing the break point. If either fails, an error is correctly returned and the debugger behaves correctly. Then an end user can make the descision to use hardware breakpoints. Also update the documentation to reflect that using CONFIG_DEBUG_RODATA will inhibit the use of software breakpoints. Signed-off-by: Jason Wessel --- kernel/kgdb.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 3ec23c3..c0d45b2 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -166,13 +166,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup); * Weak aliases for breakpoint management, * can be overriden by architectures when needed: */ -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - - return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); -} - int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) { int err; @@ -191,6 +184,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) (char *)bundle, BREAK_INSTR_SIZE); } +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + int err; + /* Validate setting the breakpoint and then removing it. In the + * remove fails, the kernel needs to emit a bad message because we + * are deep trouble not being able to put things back the way we + * found them. + */ + err = kgdb_arch_set_breakpoint(addr, tmp_variable); + if (err) + return err; + err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + if (err) + printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " + "memory destroyed at: %lx", addr); + return err; +} + unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) { return instruction_pointer(regs); -- cgit v1.1 From 25fc999913839a45cbb48ac7872e67f7521e7ed9 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 1 Aug 2008 08:39:35 -0500 Subject: kgdb: fix gdb serial thread queries The command "info threads" did not work correctly with kgdb. It would result in a silent kernel hang if used. This patach addresses several problems. - Fix use of deprecated NR_CPUS - Fix kgdb to not walk linearly through the pid space - Correctly implement shadow pids - Change the threads per query to a #define - Fix kgdb_hex2long to work with negated values The threads 0 and -1 are reserved to represent the current task. That means that CPU 0 will start with a shadow thread id of -2, and CPU 1 will have a shadow thread id of -3, etc... From the debugger you can switch to a shadow thread to see what one of the other cpus was doing, however it is not possible to execute run control operations on any other cpu execept the cpu executing the kgdb_handle_exception(). 
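To make the shadow mapping concrete: thread ids 0 and -1 mean "the task currently in kgdb", real tasks keep their pid, and CPU n shows up as thread -(n + 2), so -2 is CPU 0, -3 is CPU 1, and so on. In code (shadow_pid() mirrors the patch; shadow_cpu() is only an illustrative helper, not something the patch adds):

    /* pid -> gdb thread id: real tasks keep their pid, an idle task
     * (pid 0) is reported as the shadow id of its cpu */
    static inline int shadow_pid(int realpid)
    {
            if (realpid)
                    return realpid;
            return -raw_smp_processor_id() - 2;
    }

    /* gdb thread id -> cpu number, for ids in the shadow range */
    static inline int shadow_cpu(int tid)
    {
            return -tid - 2;        /* -2 -> 0, -3 -> 1, -4 -> 2, ... */
    }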
Signed-off-by: Jason Wessel --- kernel/kgdb.c | 68 +++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index c0d45b2..eaa21fc 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -56,12 +56,14 @@ static int kgdb_break_asap; +#define KGDB_MAX_THREAD_QUERY 17 struct kgdb_state { int ex_vector; int signo; int err_code; int cpu; int pass_exception; + unsigned long thr_query; unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; @@ -445,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) { int hex_val; int num = 0; + int negate = 0; *long_val = 0; + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } while (**ptr) { hex_val = hex(**ptr); if (hex_val < 0) @@ -458,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) (*ptr)++; } + if (negate) + *long_val = -*long_val; + return num; } @@ -527,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value) static struct task_struct *getthread(struct pt_regs *regs, int tid) { /* - * Non-positive TIDs are remapped idle tasks: + * Non-positive TIDs are remapped to the cpu shadow information */ - if (tid <= 0) - return idle_task(-tid); + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < 0) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } /* * find_task_by_pid_ns() does not take the tasklist lock anymore @@ -737,14 +753,15 @@ setundefined: } /* - * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 */ static inline int shadow_pid(int realpid) { if (realpid) return realpid; - return -1-raw_smp_processor_id(); + return -raw_smp_processor_id() - 2; } static char gdbmsgbuf[BUFMAX + 1]; @@ -838,7 +855,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; } else { local_debuggerinfo = NULL; - for (i = 0; i < NR_CPUS; i++) { + for_each_online_cpu(i) { /* * Try to find the task on some other * or possibly this node if we do not @@ -972,10 +989,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks) /* Handle the 'q' query packets */ static void gdb_cmd_query(struct kgdb_state *ks) { - struct task_struct *thread; + struct task_struct *g; + struct task_struct *p; unsigned char thref[8]; char *ptr; int i; + int cpu; + int finished = 0; switch (remcom_in_buffer[1]) { case 's': @@ -985,22 +1005,34 @@ static void gdb_cmd_query(struct kgdb_state *ks) break; } - if (remcom_in_buffer[1] == 'f') - ks->threadid = 1; - + i = 0; remcom_out_buffer[0] = 'm'; ptr = remcom_out_buffer + 1; - - for (i = 0; i < 17; ks->threadid++) { - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) { - int_to_threadref(thref, ks->threadid); + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); pack_threadid(ptr, thref); ptr += BUF_THREAD_ID_SIZE; *(ptr++) = ','; i++; } } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + *(--ptr) = '\0'; break; @@ -1023,15 +1055,15 @@ static void gdb_cmd_query(struct 
kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); break; } - if (ks->threadid > 0) { + if ((int)ks->threadid > 0) { kgdb_mem2hex(getthread(ks->linux_regs, ks->threadid)->comm, remcom_out_buffer, 16); } else { static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - sprintf(tmpstr, "Shadow task %d for pid 0", - (int)(-ks->threadid-1)); + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); } break; -- cgit v1.1 From ee1d315663ee0b494898f813a266d6244b263b4f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 7 Jul 2008 10:49:45 -0400 Subject: [PATCH] Audit: Collect signal info when SIGUSR2 is sent to auditd Makes the kernel audit subsystem collect information about the sending process when that process sends SIGUSR2 to the userspace audit daemon. SIGUSR2 is a new interesting signal to auditd telling auditd that it should try to start logging to disk again and the error condition which caused it to stop logging to disk (usually out of space) has been rectified. Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4699950..580a538 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2375,7 +2375,7 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_context *ctx = tsk->audit_context; if (audit_pid && t->tgid == audit_pid) { - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { audit_sig_pid = tsk->pid; if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; -- cgit v1.1 From 1d6c9649e236caa2e93e3647256216e57172b011 Mon Sep 17 00:00:00 2001 From: Vesa-Matti J Kari Date: Wed, 23 Jul 2008 00:06:13 +0300 Subject: kernel/audit.c control character detection is off-by-one Hello, According to my understanding there is an off-by-one bug in the function: audit_string_contains_control() in: kernel/audit.c Patch is included. I do not know from how many places the function is called from, but for example, SELinux Access Vector Cache tries to log untrusted filenames via call path: avc_audit() audit_log_untrustedstring() audit_log_n_untrustedstring() audit_string_contains_control() If audit_string_contains_control() detects control characters, then the string is hex-encoded. But the hex=0x7f dec=127, DEL-character, is not detected. I guess this could have at least some minor security implications, since a user can create a filename with 0x7f in it, causing logged filename to possibly look different when someone reads it on the terminal. 
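In other words, only printable ASCII from 0x21 ('!') up to and including 0x7e ('~'), minus the quote character, may be logged as-is; space, DEL (0x7f) and anything outside that range forces hex encoding. A minimal restatement of the corrected check (illustrative; contains_control() is a stand-in name for the kernel's audit_string_contains_control()):

    /* returns 1 if 'string' must be hex-encoded before being logged */
    static int contains_control(const char *string, size_t len)
    {
            const unsigned char *p;

            for (p = (const unsigned char *)string;
                 p < (const unsigned char *)string + len && *p; p++) {
                    /* 0x7f (DEL) is now rejected along with 0x00-0x20 */
                    if (*p == '"' || *p < 0x21 || *p > 0x7e)
                            return 1;
            }
            return 0;
    }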
Signed-off-by: Vesa-Matti Kari Signed-off-by: Al Viro --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e092f1c..6d90318 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1366,7 +1366,7 @@ int audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len && *p; p++) { - if (*p == '"' || *p < 0x21 || *p > 0x7f) + if (*p == '"' || *p < 0x21 || *p > 0x7e) return 1; } return 0; -- cgit v1.1 From 036bbf76ad9f83781590623111b80ba0b82930ac Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Fri, 1 Aug 2008 09:47:01 +0800 Subject: Re: [PATCH] the loginuid field should be output in all AUDIT_CONFIG_CHANGE audit messages > shouldn't these be using the "audit_get_loginuid(current)" and if we > are going to output loginuid we also should be outputting sessionid Thanks for your detailed explanation. I have made a new patch that outputs "loginuid" and "sessionid" via audit_get_loginuid(current) and audit_get_sessionid(current). Please point out anything that is still missing. Signed-off-by: Zhang Xiliang Signed-off-by: Al Viro --- kernel/auditfilter.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 98c50cc..b7d354e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); audit_log_format(ab, - "op=updated rules specifying path="); + " op=updated rules specifying path="); audit_log_untrustedstring(ab, owatch->path); audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); @@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule path="); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_format(ab, " op=remove rule path="); audit_log_untrustedstring(ab, w->path); if (r->filterkey) { audit_log_format(ab, " key="); -- cgit v1.1 From 980dfb0db340b95094732d78b55311f2c539c1af Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Fri, 1 Aug 2008 19:15:47 +0800 Subject: [PATCH] Fix the kernel panic of audit_filter_task when key field is set audit_filter_task() calls audit_filter_rules() with a NULL audit_context. If the key field is set, audit_filter_rules() sets the result to 1 and assigns the key to ctx->filterkey. But ctx is NULL in this case, so the kernel panics. A small userspace reduction of the problem is sketched below.
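This is a hypothetical userspace reduction of the bug (plain libc, made-up names), not the kernel code; it shows why the unconditional assignment dereferences NULL and how the added ctx check avoids it:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Stand-in for struct audit_context; only the field we care about. */
    struct context { char *filterkey; };

    static void apply_rule_key(struct context *ctx, const char *rule_key)
    {
            /* Before the fix the assignment was unconditional:
             *     ctx->filterkey = strdup(rule_key);   <- NULL dereference when !ctx
             * The fix only stores the key when a context actually exists. */
            if (rule_key && ctx)
                    ctx->filterkey = strdup(rule_key);
    }

    int main(void)
    {
            struct context ctx = { NULL };

            apply_rule_key(&ctx, "mykey");  /* normal syscall-exit filtering path */
            apply_rule_key(NULL, "mykey");  /* audit_filter_task() path: ctx is NULL */
            printf("filterkey=%s, NULL context survived\n", ctx.filterkey);
            free(ctx.filterkey);
            return 0;
    }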
Signed-off-by: Zhang Xiliang Signed-off-by: Al Viro --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 580a538..496c3dd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -610,7 +610,7 @@ static int audit_filter_rules(struct task_struct *tsk, if (!result) return 0; } - if (rule->filterkey) + if (rule->filterkey && ctx) ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; -- cgit v1.1 From 20c6aaa39ab735c7ed78e4e5a214d250efae0a6e Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Thu, 31 Jul 2008 10:11:19 +0800 Subject: [PATCH] Fix missing error reporting when setting AUDIT_STATUS_RATE_LIMIT fails When "status_get->mask" has both AUDIT_STATUS_RATE_LIMIT and AUDIT_STATUS_BACKLOG_LIMIT set, and audit_set_rate_limit() fails while audit_set_backlog_limit() succeeds, the "err" value ends up greater than or equal to 0, so the rate-limit failure is silently lost. Signed-off-by: Zhang Xiliang Acked-by: Eric Paris Signed-off-by: Al Viro --- kernel/audit.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6d90318..4414e93 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(status_get->enabled, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(status_get->failure, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_PID) { int new_pid = status_get->pid; @@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = new_pid; audit_nlk_pid = NETLINK_CB(skb).pid; } - if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) + if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(status_get->rate_limit, loginuid, sessionid, sid); + if (err < 0) + return err; + } if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit, loginuid, sessionid, sid); -- cgit v1.1 From 5c7edcd7ee6b77b88252fe4096dce1a46a60c829 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 31 Jul 2008 02:04:09 -0700 Subject: tracehook: fix exit_signal=0 case My commit 2b2a1ff64afbadac842bbc58c5166962cf4f7664 introduced a regression (sorry about that) for the odd case of exit_signal=0 (e.g. clone_flags=0). This is not a normal use, but it's used by a case in the glibc test suite. Dying with exit_signal=0 sends no signal, but it's supposed to wake up a parent's blocked wait*() calls (unlike the delayed_group_leader case). This fixes tracehook_notify_death() and its caller to distinguish a "signal 0" wakeup from the delayed_group_leader case (with no wakeup). A small model of the corrected three-way decision is sketched below.
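The following is a tiny userspace model, illustrative only (the DEATH_REAP value and the helper are assumptions, not the kernel definitions), of the three-way decision exit_notify() now makes on the value returned by tracehook_notify_death():

    #include <stdio.h>
    #include <signal.h>

    #define DEATH_REAP (-1)   /* assumed sentinel: "nobody is going to wait for us" */

    /* Model of the fixed logic: any value >= 0 (including the exit_signal=0 case)
     * must still notify/wake the parent; only DEATH_REAP skips notification and
     * lets the task release itself. */
    static const char *decide(int notify)
    {
            if (notify >= 0)
                    return "notify parent / wake wait*(), exit_state = EXIT_ZOMBIE";
            return "no waiter: exit_state = EXIT_DEAD, release_task()";
    }

    int main(void)
    {
            printf("SIGCHLD    -> %s\n", decide(SIGCHLD));
            printf("signal 0   -> %s\n", decide(0));          /* the regressed case */
            printf("DEATH_REAP -> %s\n", decide(DEATH_REAP));
            return 0;
    }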
Signed-off-by: Roland McGrath Tested-by: Serge Hallyn Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index eb4d647..38ec406 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -911,10 +911,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_signal = SIGCHLD; signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal > 0) + if (signal >= 0) signal = do_notify_parent(tsk, signal); - tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -927,7 +927,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tracehook_report_death(tsk, signal, cookie, group_dead); /* If the process is dead, release it - nobody will wait for it */ - if (signal < 0) + if (signal == DEATH_REAP) release_task(tsk); } -- cgit v1.1 From 1a61c88defcd611bd148d6c960b498e1b8bbbe00 Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Sat, 2 Aug 2008 10:56:37 +0800 Subject: Re: [PATCH] Fix the kernel panic of audit_filter_task when key field is set Sorry, I missed a blank between "if" and "(". I have also added "unlikely" to the "ctx" checks in audit_match_perm() and audit_match_filetype(). This is a new patch for it. Signed-off-by: Zhang Xiliang Signed-off-by: Al Viro --- kernel/auditsc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 496c3dd..972f8e6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -243,6 +243,9 @@ static inline int open_arg(int flags, int mask) static int audit_match_perm(struct audit_context *ctx, int mask) { + if (unlikely(!ctx)) + return 0; + unsigned n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ @@ -284,6 +287,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which) { unsigned index = which & ~S_IFMT; mode_t mode = which & S_IFMT; + + if (unlikely(!ctx)) + return 0; + if (index >= ctx->name_count) return 0; if (ctx->names[index].ino == -1) -- cgit v1.1 From 725aad24c3ba96a7c06448c14c265a466cdbd663 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 3 Aug 2008 09:33:03 -0700 Subject: __sched_setscheduler: don't do any policy checks when not "user" The "user" parameter to __sched_setscheduler indicates whether the change is being done on behalf of a user process or not. If not, we shouldn't apply any permission checks, so don't call security_task_setscheduler(). Signed-off-by: Jeremy Fitzhardinge Tested-by: Steve Wise Cc: Rusty Russell Cc: "Rafael J. Wysocki" Signed-off-by: Linus Torvalds --- kernel/sched.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 21f7da9..04160d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5004,19 +5004,21 @@ recheck: return -EPERM; } + if (user) { #ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (user - && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. 
+ */ + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + return -EPERM; #endif - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + } + /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: -- cgit v1.1 From 32194450330be327f3b25bf6b66298bd122599e9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 5 Aug 2008 13:01:10 -0700 Subject: relay: fix "full buffer with exactly full last subbuffer" accounting problem In relay's current read implementation, if the buffer is completely full but hasn't triggered the buffer-full condition (i.e. the last write didn't cross the subbuffer boundary) and the last subbuffer is exactly full, the subbuffer accounting code erroneously finds nothing available. This patch fixes the problem. Signed-off-by: Tom Zanussi Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Andrea Righi Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 04006ef..8d13a78 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -944,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf, size_t n_subbufs = buf->chan->n_subbufs; size_t read_subbuf; + if (buf->subbufs_produced == buf->subbufs_consumed && + buf->offset == buf->bytes_consumed) + return; + if (buf->bytes_consumed + bytes_consumed > subbuf_size) { relay_subbufs_consumed(buf->chan, buf->cpu, 1); buf->bytes_consumed = 0; @@ -975,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) relay_file_read_consume(buf, read_pos, 0); + consumed = buf->subbufs_consumed; + if (unlikely(buf->offset > subbuf_size)) { if (produced == consumed) return 0; @@ -993,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) if (consumed > produced) produced += n_subbufs * subbuf_size; - if (consumed == produced) + if (consumed == produced) { + if (buf->offset == subbuf_size && + buf->subbufs_produced > buf->subbufs_consumed) + return 1; return 0; + } return 1; } -- cgit v1.1 From 5b2becc8cffdccdd60c63099f592ddd35aa6c34f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 5 Aug 2008 13:01:13 -0700 Subject: semaphore: __down_common: use signal_pending_state() Change __down_common() to use signal_pending_state() instead of open coding. The changes in kernel/semaphore.o are just artifacts, the state checks are optimized away. 
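As a rough userspace model (the state bit values and task fields here are made up; only the folding logic mirrors the kernel helper), signal_pending_state() combines the two checks that __down_common() used to open-code:

    #include <stdio.h>
    #include <stdbool.h>

    /* Illustrative state bits, not the kernel's definitions. */
    #define TASK_INTERRUPTIBLE   0x01
    #define TASK_UNINTERRUPTIBLE 0x02
    #define TASK_WAKEKILL        0x80
    #define TASK_KILLABLE        (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)

    struct task { bool sig_pending; bool fatal_sig_pending; };

    /* Interruptible sleeps break on any pending signal; killable sleeps only on
     * fatal ones; plain uninterruptible sleeps ignore signals entirely. */
    static bool signal_pending_state(long state, const struct task *t)
    {
            if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
                    return false;
            if (!t->sig_pending)
                    return false;
            return (state & TASK_INTERRUPTIBLE) || t->fatal_sig_pending;
    }

    int main(void)
    {
            struct task t = { .sig_pending = true, .fatal_sig_pending = false };

            printf("INTERRUPTIBLE, signal pending  -> %d\n",
                   signal_pending_state(TASK_INTERRUPTIBLE, &t));
            printf("KILLABLE, non-fatal pending    -> %d\n",
                   signal_pending_state(TASK_KILLABLE, &t));
            t.fatal_sig_pending = true;
            printf("KILLABLE, fatal signal pending -> %d\n",
                   signal_pending_state(TASK_KILLABLE, &t));
            return 0;
    }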
Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/semaphore.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/semaphore.c b/kernel/semaphore.c index aaaeae8..94a62c0 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state, waiter.up = 0; for (;;) { - if (state == TASK_INTERRUPTIBLE && signal_pending(task)) - goto interrupted; - if (state == TASK_KILLABLE && fatal_signal_pending(task)) + if (signal_pending_state(state, task)) goto interrupted; if (timeout <= 0) goto timed_out; -- cgit v1.1 From c69ad71bcdecbaab82cfacb1dc967bd7fd967a3b Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 5 Aug 2008 13:01:14 -0700 Subject: genirq: better warning on irqchip->set_type() failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While I'm glad to finally see the hole fixed whereby passing an invalid IRQ trigger type to request_irq() would be ignored, the current diagnostic isn't quite useful. Fixed by also listing the trigger type which was rejected. Signed-off-by: David Brownell Acked-by: Uwe Kleine-König Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 152abfd..0314074 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,8 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); if (ret) - pr_err("setting flow type for irq %u failed (%pF)\n", + pr_err("setting trigger mode %d for irq %u failed (%pF)\n", + (int)(flags & IRQF_TRIGGER_MASK), irq, chip->set_type); return ret; -- cgit v1.1 From d2dc1f4adb4b5b02d87e49e115e5107f4da790c0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 5 Aug 2008 13:01:31 -0700 Subject: dma: fix order calculation in dma_mark_declared_memory_occupied() get_order() takes byte-sized input, not a page-granular one. Irrespective of this fix I'm inclined to believe that this doesn't work right anyway - bitmap_allocate_region() has an implicit assumption of 'pos' being suitable for 'order', which this function doesn't seem to enforce (and since it's being called with a byte-granular value there's no reason to believe that the callers would make sure device_addr is passed accordingly - it's also not documented that way). Signed-off-by: Jan Beulich Cc: James E.J. 
Bottomley Cc: Ingo Molnar Cc: Dmitry Baryshkov Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma-coherent.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 7517115..91e9695 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -77,15 +77,14 @@ void *dma_mark_declared_memory_occupied(struct device *dev, { struct dma_coherent_mem *mem = dev->dma_mem; int pos, err; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); - pages >>= PAGE_SHIFT; + size += device_addr & ~PAGE_MASK; if (!mem) return ERR_PTR(-EINVAL); pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); if (err != 0) return ERR_PTR(err); return mem->virt_base + (pos << PAGE_SHIFT); -- cgit v1.1 From bf1db69fbf4ff511e88736ce2e6318846f34492b Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Tue, 5 Aug 2008 13:01:35 -0700 Subject: pm_qos: spelling fixes A documentation cleanup patch. With a minor tweak to clarify units for kbs. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: mark gross Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pm_qos_params.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 8cb7570..da9c2dd 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -24,7 +24,7 @@ * requirement that the application has is cleaned up when closes the file * pointer or exits the pm_qos_object will get an opportunity to clean up. * - * mark gross mgross@linux.intel.com + * Mark Gross */ #include @@ -211,8 +211,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement); * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos - * performance charactoistics. It recomputes the agregate QoS expectations for - * the pm_qos_class of parrameters. + * performance characteristics. It recomputes the aggregate QoS expectations + * for the pm_qos_class of parameters. */ int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) { @@ -250,10 +250,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement); * @name: identifies the request * @value: defines the qos request * - * Updates an existing qos requierement for the pm_qos_class of parameters along + * Updates an existing qos requirement for the pm_qos_class of parameters along * with updating the target pm_qos_class value. * - * If the named request isn't in the lest then no change is made. + * If the named request isn't in the list then no change is made. */ int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) { @@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement); * @pm_qos_class: identifies which list of qos request to us * @name: identifies the request * - * Will remove named qos request from pm_qos_class list of parrameters and + * Will remove named qos request from pm_qos_class list of parameters and * recompute the current target value for the pm_qos_class. */ void pm_qos_remove_requirement(int pm_qos_class, char *name) @@ -319,7 +319,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); * @notifier: notifier block managed by caller. * * will register the notifier into a notification chain that gets called - * uppon changes to the pm_qos_class target value. 
+ * upon changes to the pm_qos_class target value. */ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) { @@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier); * @notifier: notifier block to be removed. * * will remove the notifier from the notification chain that gets called - * uppon changes to the pm_qos_class target value. + * upon changes to the pm_qos_class target value. */ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) { -- cgit v1.1