[dev.fuzz] internal/fuzz: fix deadlock with multiple workers

CoordinateFuzzing now continues to run after discovering a crasher. It waits until all workers have terminated before returning. This fixes a deadlock that occurred when multiple workers discovered crashers concurrently. CoordinateFuzzing would receive one crasher, close doneC (telling workers to stop), then wait for workers to stop without receiving more crashers. Other workers would block sending crashers. Change-Id: I55a64aac0e6e43f5e36b9d03c15051c3d5debb20 Reviewed-on: https://go-review.googlesource.com/c/go/+/293369 Trust: Jay Conrod <jayconrod@google.com> Trust: Katie Hockman <katie@golang.org> Run-TryBot: Jay Conrod <jayconrod@google.com> Reviewed-by: Katie Hockman <katie@golang.org>
2021-02-17 17:31:22 -05:00 · 2021-02-17 17:31:22 -05:00 · 6ee1506769
parent d4825819fe
commit 6ee1506769
8 changed files with 60 additions and 67 deletions
--- a/src/cmd/go/testdata/script/test_fuzz.txt
+++ b/src/cmd/go/testdata/script/test_fuzz.txt
@ -9,12 +9,12 @@ stdout FAIL

 # Test that fuzzing a fuzz target that returns without failing or calling
 # f.Fuzz fails and causes a non-zero exit status.
-! go test -fuzz=Fuzz -fuzztime=5s -parallel=1 noop_fuzz_test.go
+! go test -fuzz=Fuzz -fuzztime=5s noop_fuzz_test.go
 ! stdout ^ok
 stdout FAIL

 # Test that calling f.Error in a fuzz target causes a non-zero exit status.
-! go test -fuzz=Fuzz -fuzztime=5s -parallel=1 error_fuzz_test.go
+! go test -fuzz=Fuzz -fuzztime=5s error_fuzz_test.go
 ! stdout ^ok
 stdout FAIL

@ -29,12 +29,12 @@ stdout ^ok
 ! stdout FAIL

 # Test that successful fuzzing exits cleanly.
-go test -fuzz=Fuzz -fuzztime=5s -parallel=1 success_fuzz_test.go
+go test -fuzz=Fuzz -fuzztime=5s success_fuzz_test.go
 stdout ok
 ! stdout FAIL

 # Test that calling f.Fatal while fuzzing causes a non-zero exit status.
-! go test -fuzz=Fuzz -fuzztime=5s -parallel=1 fatal_fuzz_test.go
+! go test -fuzz=Fuzz -fuzztime=5s fatal_fuzz_test.go
 ! stdout ^ok
 stdout FAIL

--- a/src/cmd/go/testdata/script/test_fuzz_cache.txt
+++ b/src/cmd/go/testdata/script/test_fuzz_cache.txt
@ -10,7 +10,7 @@ exists $GOCACHE
 ! exists $GOCACHE/fuzz

 # Fuzzing should write interesting values to the cache.
-go test -fuzz=FuzzY -fuzztime=5s -parallel=1 .
+go test -fuzz=FuzzY -fuzztime=5s .
 go run ./contains_files $GOCACHE/fuzz/example.com/y/FuzzY

 # 'go clean -cache' should not delete the fuzz cache.
--- a/src/cmd/go/testdata/script/test_fuzz_fuzztime.txt
+++ b/src/cmd/go/testdata/script/test_fuzz_fuzztime.txt
@ -7,7 +7,7 @@
 go test

 # Fuzzing should exit 0 when after fuzztime, even if timeout is short.
-go test -timeout=10ms -fuzz=FuzzFast -fuzztime=5s -parallel=1
+go test -timeout=10ms -fuzz=FuzzFast -fuzztime=5s

 # We should see the same behavior when invoking the test binary directly.
 go test -c
--- a/src/cmd/go/testdata/script/test_fuzz_match.txt
+++ b/src/cmd/go/testdata/script/test_fuzz_match.txt
@ -7,12 +7,12 @@ go test standalone_fuzz_test.go
 stdout '^ok'

 # Matches only for fuzzing.
-go test -fuzz Fuzz -fuzztime 5s -parallel 1 standalone_fuzz_test.go
+go test -fuzz Fuzz -fuzztime 2s -parallel 4 standalone_fuzz_test.go
 ! stdout '^ok.*\[no tests to run\]'
 stdout '^ok'

 # Matches none for fuzzing but will run the fuzz target as a test.
-go test -fuzz ThisWillNotMatch -fuzztime 5s -parallel 1 standalone_fuzz_test.go
+go test -fuzz ThisWillNotMatch -fuzztime 2s -parallel 4 standalone_fuzz_test.go
 ! stdout '^ok.*\[no tests to run\]'
 stdout '^ok'
 stdout '\[no targets to fuzz\]'
@ -30,7 +30,7 @@ stdout '^ok.*\[no tests to run\]'
 ! stdout '\[no targets to fuzz\]'

 # Matches more than one fuzz target for fuzzing.
-go test -fuzz Fuzz -fuzztime 5s -parallel 1 multiple_fuzz_test.go
+go test -fuzz Fuzz -fuzztime 2s -parallel 4 multiple_fuzz_test.go
 # The tests should run, but not be fuzzed
 ! stdout '\[no tests to run\]'
 ! stdout '\[no targets to fuzz\]'
--- a/src/cmd/go/testdata/script/test_fuzz_mutate_crash.txt
+++ b/src/cmd/go/testdata/script/test_fuzz_mutate_crash.txt
@ -8,50 +8,48 @@

 [short] skip

-# TODO: remove -parallel=1 once the races are fixed.
-
 # Running the seed corpus for all of the targets should pass the first
 # time, since nothing in the seed corpus will cause a crash.
-go test -parallel=1
+go test

 # Running the fuzzer should find a crashing input quickly.
-! go test -fuzz=FuzzWithBug -fuzztime=5s -parallel=1
+! go test -fuzz=FuzzWithBug -fuzztime=5s
 stdout 'testdata[/\\]corpus[/\\]FuzzWithBug[/\\]'
 stdout 'this input caused a crash!'
 go run check_testdata.go FuzzWithBug

 # Now, the failing bytes should have been added to the seed corpus for
 # the target, and should fail when run without fuzzing.
-! go test -parallel=1
+! go test

 # Running the fuzzer should find a crashing input quickly for fuzzing two types.
-! go test -run=FuzzWithTwoTypes -fuzz=FuzzWithTwoTypes -fuzztime=5s -parallel=1
+! go test -run=FuzzWithTwoTypes -fuzz=FuzzWithTwoTypes -fuzztime=5s
 stdout 'testdata[/\\]corpus[/\\]FuzzWithTwoTypes[/\\]'
 stdout 'these inputs caused a crash!'
 go run check_testdata.go FuzzWithTwoTypes

-! go test -run=FuzzWithNilPanic -fuzz=FuzzWithNilPanic -fuzztime=5s -parallel=1
+! go test -run=FuzzWithNilPanic -fuzz=FuzzWithNilPanic -fuzztime=5s
 stdout 'testdata[/\\]corpus[/\\]FuzzWithNilPanic[/\\]'
 stdout 'runtime.Goexit'
 go run check_testdata.go FuzzWithNilPanic

-! go test -run=FuzzWithFail -fuzz=FuzzWithFail -fuzztime=5s -parallel=1
+! go test -run=FuzzWithFail -fuzz=FuzzWithFail -fuzztime=5s
 stdout 'testdata[/\\]corpus[/\\]FuzzWithFail[/\\]'
 go run check_testdata.go FuzzWithFail

-! go test -run=FuzzWithErrorf -fuzz=FuzzWithErrorf -fuzztime=5s -parallel=1
+! go test -run=FuzzWithErrorf -fuzz=FuzzWithErrorf -fuzztime=5s
 stdout 'testdata[/\\]corpus[/\\]FuzzWithErrorf[/\\]'
 # TODO: Uncomment this part of the test once it's fixed
 # stdout 'errorf was called here'
 go run check_testdata.go FuzzWithErrorf

-! go test -run=FuzzWithFatalf -fuzz=FuzzWithFatalf -fuzztime=5s -parallel=1
+! go test -run=FuzzWithFatalf -fuzz=FuzzWithFatalf -fuzztime=5s
 stdout 'testdata[/\\]corpus[/\\]FuzzWithFatalf[/\\]'
 # TODO: Uncomment this part of the test once it's fixed
 # stdout 'fatalf was called here'
 go run check_testdata.go FuzzWithFatalf

-! go test -run=FuzzWithBadExit -fuzz=FuzzWithBadExit -fuzztime=5s -parallel=1
+! go test -run=FuzzWithBadExit -fuzz=FuzzWithBadExit -fuzztime=5s
 stdout 'testdata[/\\]corpus[/\\]FuzzWithBadExit[/\\]'
 stdout 'unexpectedly'
 go run check_testdata.go FuzzWithBadExit
@ -154,8 +152,8 @@ func main() {
 		os.Exit(1)
 	}

-	if len(files) != 1 {
-		fmt.Fprintln(os.Stderr, fmt.Errorf("expect only one new mutation to be written to testdata", len(files)))
+	if len(files) == 0 {
+		fmt.Fprintf(os.Stderr, "expect at least one new mutation to be written to testdata\n")
 		os.Exit(1)
 	}

@ -166,13 +164,13 @@ func main() {
 		os.Exit(1)
 	}
 	if bytes.Equal(contents, []byte("aa")) {
-		fmt.Fprintln(os.Stderr, fmt.Errorf("newly written testdata entry was not mutated"))
+		fmt.Fprintf(os.Stderr, "newly written testdata entry was not mutated\n")
 		os.Exit(1)
 	}
 	// The hash of the bytes in the file should match the filename.
 	h := []byte(fmt.Sprintf("%x", sha256.Sum256(contents)))
 	if !bytes.Equal([]byte(fname), h) {
-		fmt.Fprintln(os.Stderr, fmt.Errorf("hash of bytes %q does not match filename %q", h, fname))
+		fmt.Fprintf(os.Stderr, "hash of bytes %q does not match filename %q\n", h, fname)
 		os.Exit(1)
 	}
-}
+}
--- a/src/internal/fuzz/fuzz.go
+++ b/src/internal/fuzz/fuzz.go
@ -18,7 +18,6 @@ import (
 	"reflect"
 	"runtime"
 	"strings"
-	"sync"
 )

 // CoordinateFuzzing creates several worker processes and communicates with
@ -82,8 +81,8 @@ func CoordinateFuzzing(ctx context.Context, parallel int, seed []CorpusEntry, ty
 		inputC:       make(chan CorpusEntry),
 		interestingC: make(chan CorpusEntry),
 		crasherC:     make(chan crasherEntry),
-		errC:         make(chan error),
 	}
+	errC := make(chan error)

 	newWorker := func() (*worker, error) {
 		mem, err := sharedMemTempFile(sharedMemSize)
@ -102,6 +101,19 @@ func CoordinateFuzzing(ctx context.Context, parallel int, seed []CorpusEntry, ty
 		}, nil
 	}

+	var fuzzErr error
+	stopping := false
+	stop := func(err error) {
+		if fuzzErr == nil || fuzzErr == ctx.Err() {
+			fuzzErr = err
+		}
+		if stopping {
+			return
+		}
+		stopping = true
+		close(c.doneC)
+	}
+
 	// Start workers.
 	workers := make([]*worker, parallel)
 	for i := range workers {
@ -111,38 +123,22 @@ func CoordinateFuzzing(ctx context.Context, parallel int, seed []CorpusEntry, ty
 			return err
 		}
 	}
-
-	workerErrs := make([]error, len(workers))
-	var wg sync.WaitGroup
-	wg.Add(len(workers))
 	for i := range workers {
-		go func(i int) {
-			defer wg.Done()
-			workerErrs[i] = workers[i].runFuzzing()
-			if cleanErr := workers[i].cleanup(); workerErrs[i] == nil {
-				workerErrs[i] = cleanErr
+		w := workers[i]
+		go func() {
+			err := w.runFuzzing()
+			cleanErr := w.cleanup()
+			if err == nil {
+				err = cleanErr
 			}
-		}(i)
+			errC <- err
+		}()
 	}

-	// Before returning, signal workers to stop, wait for them to actually stop,
-	// and gather any errors they encountered.
-	defer func() {
-		close(c.doneC)
-		wg.Wait()
-		if err == nil || err == ctx.Err() {
-			for _, werr := range workerErrs {
-				if werr != nil {
-					// Return the first error found, replacing ctx.Err() if a more
-					// interesting error is found.
-					err = werr
-					break
-				}
-			}
-		}
-	}()
-
 	// Main event loop.
+	// Do not return until all workers have terminated. We avoid a deadlock by
+	// receiving messages from workers even after closing c.doneC.
+	activeWorkers := len(workers)
 	i := 0
 	for {
 		select {
@ -152,7 +148,7 @@ func CoordinateFuzzing(ctx context.Context, parallel int, seed []CorpusEntry, ty
 			// not the coordinator or worker processes. 'go test' will stop running
 			// actions, but it won't interrupt its child processes. This makes it
 			// difficult to stop fuzzing on Windows without a timeout.
-			return ctx.Err()
+			stop(ctx.Err())

 		case crasher := <-c.crasherC:
 			// A worker found a crasher. Write it to testdata and return it.
@ -165,7 +161,7 @@ func CoordinateFuzzing(ctx context.Context, parallel int, seed []CorpusEntry, ty
 			}
 			// TODO(jayconrod,katiehockman): if -keepfuzzing, report the error to
 			// the user and restart the crashed worker.
-			return err
+			stop(err)

 		case entry := <-c.interestingC:
 			// Some interesting input arrived from a worker.
@ -179,13 +175,17 @@ func CoordinateFuzzing(ctx context.Context, parallel int, seed []CorpusEntry, ty
 			corpus.entries = append(corpus.entries, entry)
 			if cacheDir != "" {
 				if _, err := writeToCorpus(entry.Data, cacheDir); err != nil {
-					return err
+					stop(err)
 				}
 			}

-		case err := <-c.errC:
-			// A worker encountered a fatal error.
-			return err
+		case err := <-errC:
+			// A worker terminated, possibly after encountering a fatal error.
+			stop(err)
+			activeWorkers--
+			if activeWorkers == 0 {
+				return fuzzErr
+			}

 		case c.inputC <- corpus.entries[i]:
 			// Send the next input to any worker.
@ -268,10 +268,6 @@ type coordinator struct {
 	// should be saved in the corpus, and we may want to stop fuzzing after
 	// receiving one.
 	crasherC chan crasherEntry
-
-	// errC is sent internal errors encountered by workers. When the coordinator
-	// receives an error, it closes doneC and returns.
-	errC chan error
 }

 // readCache creates a combined corpus from seed values and values in the cache
--- a/src/internal/fuzz/sys_posix.go
+++ b/src/internal/fuzz/sys_posix.go
@ -88,5 +88,5 @@ func isInterruptError(err error) bool {
 		return false
 	}
 	status := exitErr.Sys().(syscall.WaitStatus)
-	return status.Signal() == syscall.SIGINT
+	return status.Signal() == syscall.SIGINT || status.Signal() == syscall.SIGKILL
 }
--- a/src/internal/fuzz/worker.go
+++ b/src/internal/fuzz/worker.go
@ -72,8 +72,7 @@ func (w *worker) runFuzzing() error {
 	// Start the process.
 	if err := w.start(); err != nil {
 		// We couldn't start the worker process. We can't do anything, and it's
-		// likely that other workers can't either, so give up.
-		w.coordinator.errC <- err
+		// likely that other workers can't either, so don't try to restart.
 		return err
 	}