Re: cdrecord deadlocks linux 2.6.8.1 (problem in setscheduler)

From: John Johansen (johansen@private)
Date: Wed Oct 20 2004 - 11:41:13 PDT


On Wed, Oct 20, 2004 at 09:07:06AM -0700, Chris Wright wrote:
> * John Johansen (johansen@private) wrote:
> > Sigh, it seems I posted the wrong patch file; the one that got posted is
> > the one where I threw up my hands and swore about having to break the
> > lock as Stephan suggests.
> > 
> > The actual patch adds an extra lock (I liked that a little better than
> > breaking the rq lock) in setscheduler and around the other points that
> > call __setscheduler.
> 
> Hmm, I didn't see this patch, but it's sounding like it won't fly.
> 
Ya, sorry. I ran out of time this morning and had to run to the accountant.

I think you are right that it likely won't fly, and that looping in the
unlikely case of a policy change is probably better.
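
For reference, here is a rough, untested sketch of what I imagine the looping
version of setscheduler() would look like against 2.6.8's code (the exact
recheck structure is only my guess, and the error paths and the array
dequeue/requeue around __setscheduler() are left out):

	int oldpolicy = -1;

recheck:
	/* Read p->policy without the rq lock; the -EINVAL, -EPERM and
	 * security_task_setscheduler() checks run against this value. */
	if (policy < 0)
		policy = oldpolicy = p->policy;

	rq = task_rq_lock(p, &flags);
	/*
	 * Recheck the policy now that the rq lock is held: if it changed
	 * while the checks ran unlocked, drop the lock and start over.
	 */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		task_rq_unlock(rq, &flags);
		goto recheck;
	}
	/* dequeue, __setscheduler(p, policy, lp.sched_priority) and requeue
	 * stay exactly as the current code does them. */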

jj

--- kernel/sched.c.JJ	2004-10-20 09:22:07.000000000 -0700
+++ kernel/sched.c	2004-10-20 09:55:54.000000000 -0700
@@ -466,6 +466,8 @@
 	spin_unlock_irqrestore(&rq->lock, *flags);
 }
 
+static spinlock_t policy_lock = SPIN_LOCK_UNLOCKED;
+
 #ifdef CONFIG_SCHEDSTATS
 /*
  * bump this up when changing the output format or the meaning of an existing
@@ -3186,7 +3188,7 @@
 	int retval = -EINVAL;
 	int oldprio;
 	prio_array_t *array;
-	unsigned long flags;
+	unsigned long flags, pflags;
 	runqueue_t *rq;
 	task_t *p;
 
@@ -3209,10 +3211,11 @@
 		goto out_unlock_tasklist;
 
 	/*
-	 * To be able to change p->policy safely, the apropriate
-	 * runqueue lock must be held.
+	 * To be able to change p->policy safely, the policy_lock
+	 * must be held while privilege is mediated, and the appropriate
+	 * runqueue lock must be held when actually setting it.
 	 */
-	rq = task_rq_lock(p, &flags);
+	spin_lock_irqsave(&policy_lock, pflags);
 
 	if (policy < 0)
 		policy = p->policy;
@@ -3220,7 +3223,7 @@
 		retval = -EINVAL;
 		if (policy != SCHED_FIFO && policy != SCHED_RR &&
 				policy != SCHED_NORMAL)
-			goto out_unlock;
+			goto out_unlock_policy;
 	}
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
@@ -3230,21 +3233,30 @@
 	 */
 	retval = -EINVAL;
 	if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
-		goto out_unlock;
+		goto out_unlock_policy;
 	if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
-		goto out_unlock;
+		goto out_unlock_policy;
 
 	retval = -EPERM;
 	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 	    !capable(CAP_SYS_NICE))
-		goto out_unlock;
+		goto out_unlock_policy;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 	    !capable(CAP_SYS_NICE))
-		goto out_unlock;
+		goto out_unlock_policy;
 
 	retval = security_task_setscheduler(p, policy, &lp);
 	if (retval)
-		goto out_unlock;
+		goto out_unlock_policy;
+
+	rq = task_rq_lock(p, &flags);
+
+	/*
+	 * We no longer need to hold policy_lock now that the rq lock is
+	 * held.  flags and pflags are swapped here because of the order
+	 * in which the locks are released.
+	 */
+	spin_unlock_irqrestore(&policy_lock, flags);
 
 	array = p->array;
 	if (array)
@@ -3266,13 +3278,16 @@
 			resched_task(rq->curr);
 	}
 
-out_unlock:
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, &pflags);
 out_unlock_tasklist:
 	read_unlock_irq(&tasklist_lock);
 
 out_nounlock:
 	return retval;
+
+out_unlock_policy:
+	spin_unlock_irqrestore(&policy_lock, pflags);
+	goto out_unlock_tasklist;
 }
 
 /**
@@ -4047,7 +4062,7 @@
 	int cpu = smp_processor_id();
 	runqueue_t *rq = this_rq();
 	struct task_struct *p = rq->idle;
-	unsigned long flags;
+	unsigned long flags, pflags;
 
 	/* cpu has to be offline */
 	BUG_ON(cpu_online(cpu));
@@ -4055,13 +4070,17 @@
 	/* Strictly not necessary since rest of the CPUs are stopped by now
 	 * and interrupts disabled on current cpu.
 	 */
+	spin_lock_irqsave(&policy_lock, pflags);
 	spin_lock_irqsave(&rq->lock, flags);
 
+	/* flags and pflags are swapped because of lock release order */
+	spin_unlock_irqrestore(&policy_lock, flags);
+
 	__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
 	/* Add idle task to _front_ of it's priority queue */
 	__activate_idle_task(p, rq);
 
-	spin_unlock_irqrestore(&rq->lock, flags);
+	spin_unlock_irqrestore(&rq->lock, pflags);
 }
 
 static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
@@ -4116,7 +4135,7 @@
 	int cpu = (long)hcpu;
 	struct task_struct *p;
 	struct runqueue *rq;
-	unsigned long flags;
+	unsigned long flags, pflags;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
@@ -4126,9 +4145,12 @@
 		p->flags |= PF_NOFREEZE;
 		kthread_bind(p, cpu);
 		/* Must be high prio: stop_machine expects to yield to it. */
+		spin_lock_irqsave(&policy_lock, pflags);
 		rq = task_rq_lock(p, &flags);
+	/* flags and pflags are swapped because of lock release order */
+		spin_unlock_irqrestore(&policy_lock, flags);
 		__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, &pflags);
 		cpu_rq(cpu)->migration_thread = p;
 		break;
 	case CPU_ONLINE:
@@ -4148,12 +4170,15 @@
 		kthread_stop(rq->migration_thread);
 		rq->migration_thread = NULL;
 		/* Idle task back to normal (off runqueue, low prio) */
+		spin_lock_irqsave(&policy_lock, pflags);
 		rq = task_rq_lock(rq->idle, &flags);
+	/* flags and pflags are swapped because of lock release order */
+		spin_unlock_irqrestore(&policy_lock, flags);
 		deactivate_task(rq->idle, rq);
 		rq->idle->static_prio = MAX_PRIO;
 		__setscheduler(rq->idle, SCHED_NORMAL, 0);
 		migrate_dead_tasks(cpu);
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, &pflags);
 		BUG_ON(rq->nr_running != 0);
 
 		/* No need to migrate the tasks: it was best-effort if
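
One more note, since the flag/pflag swapping in the comments above may look
odd at first glance: with nested spin_lock_irqsave() calls the saved flag
words have to be restored in the reverse order the locks are dropped,
otherwise interrupts would be re-enabled while the inner lock is still held.
Schematically (this is just the pattern the patch already uses, spelled out):

	spin_lock_irqsave(&policy_lock, pflags); /* pflags: irq state before any lock */
	rq = task_rq_lock(p, &flags);            /* flags: saved with irqs already off */

	/*
	 * Drop the outer lock first, but restore the *inner* save so that
	 * interrupts stay disabled while rq->lock is still held.
	 */
	spin_unlock_irqrestore(&policy_lock, flags);

	/* ... __setscheduler() and friends run here ... */

	/* The original irq state comes back only when the last lock is dropped. */
	task_rq_unlock(rq, &pflags);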


