diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 6416651ce5..13bc52bb68 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -773,7 +773,7 @@ runtime·gosched(void)
 void
 runtime·entersyscall(void)
 {
-	uint32 v, w;
+	uint32 v;
 
 	if(runtime·sched.predawn)
 		return;
@@ -796,24 +796,14 @@ runtime·entersyscall(void)
 	//	mcpu--
 	//	gwait not true
 	//	waitstop && mcpu <= mcpumax not true
-	// If we can do the same with a single atomic read/write,
+	// If we can do the same with a single atomic add,
 	// then we can skip the locks.
-	for(;;) {
-		v = runtime·sched.atomic;
-		if(atomic_gwaiting(v))
-			break;
-		if(atomic_waitstop(v) && atomic_mcpu(v)-1 <= atomic_mcpumax(v))
-			break;
-		w = v;
-		w += (-1<<mcpuShift);
-		if(runtime·cas(&runtime·sched.atomic, v, w))
-			return;
-	}
+	v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift);
+	if(!atomic_gwaiting(v) && (!atomic_waitstop(v) || atomic_mcpu(v) > atomic_mcpumax(v)))
+		return;
 
 	schedlock();
-
-	// atomic { mcpu--; }
-	v = runtime·xadd(&runtime·sched.atomic, (-1<<mcpuShift));
+	v = runtime·sched.atomic;
 	if(atomic_gwaiting(v))
 		matchmg();
 	if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) {
@@ -841,33 +831,18 @@ runtime·exitsyscall(void)
 	// find that we still have mcpu <= mcpumax, then we can
 	// start executing Go code immediately, without having to
 	// schedlock/schedunlock.
-	for(;;) {
-		// If the profiler frequency needs updating,
-		// take the slow path.
-		if(m->profilehz != runtime·sched.profilehz)
-			break;
-
-		v = runtime·sched.atomic;
-		if(atomic_mcpu(v) >= atomic_mcpumax(v))
-			break;
-
-		w = v;
-		w += (1<<mcpuShift);
-		if(runtime·cas(&runtime·sched.atomic, v, w)) {
-			// There's a cpu for us, so we can run.
-			g->status = Grunning;
-			// Garbage collector isn't running (since we are),
-			// so okay to clear gcstack.
-			g->gcstack = nil;
-			return;
-		}
-	}
+	v = runtime·xadd(&runtime·sched.atomic, (1<<mcpuShift));
+	if(m->profilehz == runtime·sched.profilehz && atomic_mcpu(v) <= atomic_mcpumax(v)) {
+		// There's a cpu for us, so we can run.
+		g->status = Grunning;
+		// Garbage collector isn't running (since we are),
+		// so okay to clear gcstack.
+		g->gcstack = nil;
+		return;
+	}
 
 	schedlock();
 
-	// atomic { mcpu++; }
-	runtime·xadd(&runtime·sched.atomic, (1<<mcpuShift));
-
 	// Tell scheduler to put g back on the run queue:
 	// mostly equivalent to g->status = Grunning,
 	// but keeps the garbage collector from thinking
diff --git a/src/pkg/runtime/proc.p b/src/pkg/runtime/proc.p
index 337b078773..f0b46de611 100644
--- a/src/pkg/runtime/proc.p
+++ b/src/pkg/runtime/proc.p
@@ -3,8 +3,9 @@
 // license that can be found in the LICENSE file.
 
 /*
-model for proc.c as of 2011/07/15.
-takes 4300 seconds to explore 1128130 states
+model for proc.c as of 2011/07/22.
+
+takes 4900 seconds to explore 1189070 states
 with G=3, var_gomaxprocs=1
 on a Core i7 L640 2.13 GHz Lenovo X201s.
 
@@ -329,33 +330,53 @@ inline schedule() {
 	nextgandunlock()
 }
 
+/*
+ * schedpend is > 0 if a goroutine is about to commit to
+ * entering the scheduler but has not yet done so.
+ * Just as we don't test for the undesirable conditions when a
+ * goroutine is in the scheduler, we don't test for them when
+ * a goroutine will be in the scheduler shortly.
+ * Modeling this state lets us replace mcpu cas loops with
+ * simpler mcpu atomic adds.
+ */
+byte schedpend;
+
 /*
  * entersyscall is like the C function.
  */
 inline entersyscall() {
+	bit willsched;
+
 	/*
 	 * Fast path.  Check all the conditions tested during schedlock/schedunlock
 	 * below, and if we can get through the whole thing without stopping, run it
 	 * in one atomic cas-based step.
 	 */
	atomic {
+		atomic_mcpu--;
 		if
 		:: atomic_gwaiting ->
 			skip
-		:: atomic_waitstop && atomic_mcpu-1 <= atomic_mcpumax ->
+		:: atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
 			skip
 		:: else ->
-			atomic_mcpu--;
 			goto Lreturn_entersyscall;
-		fi
+		fi;
+		willsched = 1;
+		schedpend++;
 	}
 
 	/*
 	 * Normal path.
 	 */
 	schedlock()
-	d_step {
-		atomic_mcpu--;
+	opt_dstep {
+		if
+		:: willsched ->
+			schedpend--;
+			willsched = 0
+		:: else
+		fi
 	}
 	if
 	:: atomic_gwaiting ->
@@ -382,11 +403,11 @@ inline exitsyscall() {
 	 */
 	atomic {
 		// omitted profilehz check
+		atomic_mcpu++;
 		if
 		:: atomic_mcpu >= atomic_mcpumax ->
 			skip
 		:: else ->
-			atomic_mcpu++;
 			goto Lreturn_exitsyscall
 		fi
 	}
@@ -396,7 +417,6 @@ inline exitsyscall() {
 	 */
 	schedlock();
 	d_step {
-		atomic_mcpu++;
 		if
 		:: atomic_mcpu <= atomic_mcpumax ->
 			skip
@@ -497,10 +517,10 @@ active proctype monitor() {
 
 	do
 	// Should never have goroutines waiting with procs available.
-	:: !sched_lock && gwait > 0 && atomic_mcpu < atomic_mcpumax ->
+	:: !sched_lock && schedpend==0 && gwait > 0 && atomic_mcpu < atomic_mcpumax ->
 		assert 0
 	// Should never have gc waiting for stop if things have already stopped.
-	:: !sched_lock && atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
+	:: !sched_lock && schedpend==0 && atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
 		assert 0
 	od
 }
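The C side of this change leans on the scheduler state being packed into one 32-bit word (mcpu, mcpumax, waitstop, gwaiting), so that "mcpu--" plus the post-condition checks can be done with a single runtime·xadd instead of a cas loop, as the updated comment says. The following is a minimal, self-contained sketch of that pattern in portable C11 atomics, not runtime code: the field layout follows the comments in proc.c, but the names enter_syscall, schedlock_slow_path and sched_atomic are hypothetical stand-ins.

/* sketch.c - illustrative only; assumes a 32-bit unsigned int. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

enum {
	mcpuWidth     = 15,
	mcpuMask      = (1 << mcpuWidth) - 1,
	mcpuShift     = 0,
	mcpumaxShift  = mcpuShift + mcpuWidth,
	waitstopShift = mcpumaxShift + mcpuWidth,
	gwaitingShift = waitstopShift + 1,
};

#define atomic_mcpu(v)     (((v) >> mcpuShift) & mcpuMask)
#define atomic_mcpumax(v)  (((v) >> mcpumaxShift) & mcpuMask)
#define atomic_waitstop(v) (((v) >> waitstopShift) & 1)
#define atomic_gwaiting(v) (((v) >> gwaitingShift) & 1)

static atomic_uint sched_atomic;	/* packed scheduler state word */

static void schedlock_slow_path(void)
{
	/* Placeholder for the locked slow path (matchmg, notewakeup, ...). */
	puts("slow path: taking scheduler lock");
}

/* entersyscall-style fast path: one unconditional atomic add instead of a
 * cas loop.  The value the add produces tells us whether anyone could be
 * waiting on the old state; only then do we fall back to the lock. */
static void enter_syscall(void)
{
	/* atomic_fetch_add returns the old word; add the delta again to get
	 * the new word, which is what runtime·xadd would have returned. */
	uint32_t delta = (uint32_t)-1 << mcpuShift;	/* mcpu-- */
	uint32_t v = atomic_fetch_add(&sched_atomic, delta) + delta;

	if (!atomic_gwaiting(v) &&
	    (!atomic_waitstop(v) || atomic_mcpu(v) > atomic_mcpumax(v)))
		return;		/* nobody to wake: skip the locks entirely */

	schedlock_slow_path();	/* rare case: re-check under the lock */
}

int main(void)
{
	/* Start with mcpu=1, mcpumax=2, no waiters: fast path is taken. */
	atomic_store(&sched_atomic, (1u << mcpuShift) | (2u << mcpumaxShift));
	enter_syscall();
	return 0;
}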