diff --git a/misc/ios/go_darwin_arm_exec.go b/misc/ios/go_darwin_arm_exec.go index 431ddcc70b..4495f52c90 100644 --- a/misc/ios/go_darwin_arm_exec.go +++ b/misc/ios/go_darwin_arm_exec.go @@ -26,6 +26,10 @@ import ( const debug = false +var errRetry = errors.New("failed to start test harness (retry attempted)") + +var tmpdir string + func main() { log.SetFlags(0) log.SetPrefix("go_darwin_arm_exec: ") @@ -36,39 +40,39 @@ func main() { log.Fatal("usage: go_darwin_arm_exec a.out") } - if err := run(os.Args[1], os.Args[2:]); err != nil { + var err error + tmpdir, err = ioutil.TempDir("", "go_darwin_arm_exec_") + if err != nil { + log.Fatal(err) + } + + // Approximately 1 in a 100 binaries fail to start. If it happens, + // try again. These failures happen for several reasons beyond + // our control, but all of them are safe to retry as they happen + // before lldb encounters the initial getwd breakpoint. As we + // know the tests haven't started, we are not hiding flaky tests + // with this retry. + for i := 0; i < 5; i++ { + if i > 0 { + fmt.Fprintln(os.Stderr, "start timeout, trying again") + } + err = run(os.Args[1], os.Args[2:]) + if err == nil || err != errRetry { + break + } + } + if !debug { + os.RemoveAll(tmpdir) + } + if err != nil { fmt.Fprintf(os.Stderr, "go_darwin_arm_exec: %v\n", err) os.Exit(1) } } func run(bin string, args []string) (err error) { - type waitPanic struct { - err error - } - defer func() { - if r := recover(); r != nil { - if w, ok := r.(waitPanic); ok { - err = w.err - return - } - panic(r) - } - }() - - defer exec.Command("killall", "ios-deploy").Run() // cleanup - - exec.Command("killall", "ios-deploy").Run() - - tmpdir, err := ioutil.TempDir("", "go_darwin_arm_exec_") - if err != nil { - log.Fatal(err) - } - if !debug { - defer os.RemoveAll(tmpdir) - } - appdir := filepath.Join(tmpdir, "gotest.app") + os.RemoveAll(appdir) if err := os.MkdirAll(appdir, 0755); err != nil { return err } @@ -109,9 +113,31 @@ func run(bin string, args []string) (err error) { return fmt.Errorf("codesign: %v", err) } - if err := os.Chdir(tmpdir); err != nil { + oldwd, err := os.Getwd() + if err != nil { return err } + if err := os.Chdir(filepath.Join(appdir, "..")); err != nil { + return err + } + defer os.Chdir(oldwd) + + type waitPanic struct { + err error + } + defer func() { + if r := recover(); r != nil { + if w, ok := r.(waitPanic); ok { + err = w.err + return + } + panic(r) + } + }() + + defer exec.Command("killall", "ios-deploy").Run() // cleanup + + exec.Command("killall", "ios-deploy").Run() // ios-deploy invokes lldb to give us a shell session with the app. cmd = exec.Command( @@ -175,11 +201,11 @@ func run(bin string, args []string) (err error) { w.printBuf() return fmt.Errorf("failed (stage %s): %v", stage, err) case i := <-w.find(str, timeout): - if i >= 0 { - w.clearTo(i + len(str)) - } else { - log.Printf("timed out on stage %s, continuing", stage) + if i < 0 { + log.Printf("timed out on stage %q, retrying", stage) + return errRetry } + w.clearTo(i + len(str)) return nil } } @@ -192,7 +218,11 @@ func run(bin string, args []string) (err error) { // Wait for installation and connection. if err := waitFor("ios-deploy before run", "(lldb) connect\r\nProcess 0 connected\r\n", 0); err != nil { - return err + // Retry if we see a rare and longstanding ios-deploy bug. + // https://github.com/phonegap/ios-deploy/issues/11 + // Assertion failed: (AMDeviceStartService(device, CFSTR("com.apple.debugserver"), &gdbfd, NULL) == 0) + log.Printf("%v, retrying", err) + return errRetry } // Script LLDB. Oh dear. @@ -205,9 +235,21 @@ func run(bin string, args []string) (err error) { do(`breakpoint set -n getwd`) // in runtime/cgo/gcc_darwin_arm.go fmt.Fprintln(lldb, `run`) - // Sometimes we don't see "reason = breakpoint", so we time out - // and try to continue. - if err := waitFor("br getwd", "stop reason = breakpoint", 10*time.Second); err != nil { + if err := waitFor("br getwd", "stop reason = breakpoint", 20*time.Second); err != nil { + // At this point we see several flaky errors from the iOS + // build infrastructure. The most common is never reaching + // the breakpoint, which we catch with a timeout. Very + // occasionally lldb can produce errors like: + // + // Breakpoint 1: no locations (pending). + // WARNING: Unable to resolve breakpoint to any actual locations. + // + // As no actual test code has been executed by this point, + // we treat all errors as recoverable. + if err != errRetry { + log.Printf("%v, retrying", err) + err = errRetry + } return err } if err := waitFor("br getwd prompt", "(lldb)", 0); err != nil {