Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bad interactions between timeouts and build retires #10480

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions toolkit/tools/internal/logger/log.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,17 @@ func PanicOnError(err interface{}, args ...interface{}) {
}
}

// FatalOnError logs a fatal error and any message strings, then exists (while
// running any cleanup functions registered with the log package)
func FatalOnError(err interface{}, args ...interface{}) {
if err != nil {
if len(args) > 0 {
Log.Errorf(args[0].(string), args[1:]...)
}
Log.Fatalln(err)
}
}

// WarningOnError logs a warning error and any message strings
func WarningOnError(err interface{}, args ...interface{}) {
if err != nil {
Expand Down
14 changes: 8 additions & 6 deletions toolkit/tools/pkgworker/pkgworker.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,13 @@ func main() {
logger.InitBestEffort(logFlags)

rpmsDirAbsPath, err := filepath.Abs(*rpmsDirPath)
logger.PanicOnError(err, "Unable to find absolute path for RPMs directory '%s'", *rpmsDirPath)
logger.FatalOnError(err, "Unable to find absolute path for RPMs directory '%s'", *rpmsDirPath)

toolchainDirAbsPath, err := filepath.Abs(*toolchainDirPath)
logger.PanicOnError(err, "Unable to find absolute path for toolchain RPMs directory '%s'", *toolchainDirPath)
logger.FatalOnError(err, "Unable to find absolute path for toolchain RPMs directory '%s'", *toolchainDirPath)

srpmsDirAbsPath, err := filepath.Abs(*srpmsDirPath)
logger.PanicOnError(err, "Unable to find absolute path for SRPMs directory '%s'", *srpmsDirPath)
logger.FatalOnError(err, "Unable to find absolute path for SRPMs directory '%s'", *srpmsDirPath)

chrootDir := buildChrootDirPath(*workDir, *srpmFile, *runCheck)

Expand Down Expand Up @@ -118,15 +118,15 @@ func main() {
}

builtRPMs, err := buildSRPMInChroot(chrootDir, rpmsDirAbsPath, toolchainDirAbsPath, *workerTar, *srpmFile, *repoFile, *rpmmacrosFile, *outArch, defines, *noCleanup, *runCheck, *packagesToInstall, ccacheManager, *timeout)
logger.PanicOnError(err, "Failed to build SRPM '%s'. For details see log file: %s .", *srpmFile, *logFlags.LogFile)
logger.FatalOnError(err, "Failed to build SRPM '%s'. For details see log file: %s .", *srpmFile, *logFlags.LogFile)

// For regular (non-test) package builds:
// - Copy the SRPM which produced the package to the output directory.
// - Write a comma-separated list of RPMs built to stdout that can be parsed by the invoker.
// Any output from logger will be on stderr so stdout will only contain this output.
if !*runCheck {
err = copySRPMToOutput(*srpmFile, srpmsDirAbsPath)
logger.PanicOnError(err, "Failed to copy SRPM '%s' to output directory '%s'.", *srpmFile, rpmsDirAbsPath)
logger.FatalOnError(err, "Failed to copy SRPM '%s' to output directory '%s'.", *srpmFile, rpmsDirAbsPath)

fmt.Print(strings.Join(builtRPMs, ","))
}
Expand Down Expand Up @@ -171,6 +171,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
quit := make(chan bool)
go func() {
logger.Log.Infof("Building (%s).", srpmBaseName)
startTime := time.Now()

for {
select {
Expand All @@ -180,7 +181,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
}
return
case <-time.After(buildHeartbeatTimeout):
logger.Log.Infof("Heartbeat: still building (%s).", srpmBaseName)
logger.Log.Infof("Heartbeat: still building (%s) after %s.", srpmBaseName, time.Since(startTime).String())
}
}
}()
Expand Down Expand Up @@ -238,6 +239,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
case <-time.After(timeout):
logger.Log.Errorf("Timeout after %v: killing all processes in chroot...", timeout)
shell.PermanentlyStopAllChildProcesses(unix.SIGKILL)
time.Sleep(5 * time.Second) // If we don't wait, the chroot mounts may still be busy
dmcilvaney marked this conversation as resolved.
Show resolved Hide resolved
err = fmt.Errorf("build timed out after %s", timeout)
}

Expand Down
9 changes: 5 additions & 4 deletions toolkit/tools/scheduler/buildagents/chrootagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"
"path/filepath"
"strings"
"time"

"github.com/microsoft/azurelinux/toolkit/tools/internal/shell"
"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -38,7 +39,7 @@ func (c *ChrootAgent) Initialize(config *BuildAgentConfig) (err error) {
// - outArch is the target architecture to build for.
// - runCheck is true if the package should run the "%check" section during the build
// - dependencies is a list of dependencies that need to be installed before building.
func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) (builtFiles []string, logFile string, err error) {
func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (builtFiles []string, logFile string, err error) {
// On success, pkgworker will print a comma-seperated list of all RPMs built to stdout.
// This will be the last stdout line written.
const delimiter = ","
Expand All @@ -50,7 +51,7 @@ func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch
lastStdoutLine = strings.TrimSpace(line)
}

args := serializeChrootBuildAgentConfig(c.config, basePackageName, inputFile, logFile, outArch, runCheck, dependencies)
args := serializeChrootBuildAgentConfig(c.config, basePackageName, inputFile, logFile, outArch, runCheck, dependencies, allowableRuntime)
err = shell.NewExecBuilder(c.config.Program, args...).
StdoutCallback(onStdout).
LogLevel(logrus.TraceLevel, logrus.TraceLevel).
Expand All @@ -75,7 +76,7 @@ func (c *ChrootAgent) Close() (err error) {
}

// serializeChrootBuildAgentConfig serializes a BuildAgentConfig into arguments usable by pkgworker for the sake of building the package.
func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName, inputFile, logFile, outArch string, runCheck bool, dependencies []string) (serializedArgs []string) {
func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName, inputFile, logFile, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (serializedArgs []string) {
serializedArgs = []string{
fmt.Sprintf("--input=%s", inputFile),
fmt.Sprintf("--work-dir=%s", config.WorkDir),
Expand All @@ -93,7 +94,7 @@ func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName,
fmt.Sprintf("--log-level=%s", config.LogLevel),
fmt.Sprintf("--out-arch=%s", outArch),
fmt.Sprintf("--max-cpu=%s", config.MaxCpu),
fmt.Sprintf("--timeout=%s", config.Timeout.String()),
fmt.Sprintf("--timeout=%s", allowableRuntime),
}

if config.RpmmacrosFile != "" {
Expand Down
3 changes: 2 additions & 1 deletion toolkit/tools/scheduler/buildagents/definition.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ type BuildAgent interface {
// - outArch is the target architecture to build for.
// - runCheck is true if the package should run the "%check" section during the build
// - dependencies is a list of dependencies that need to be installed before building.
BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) ([]string, string, error)
// - allowableRuntime is how long the package build is allowed to run.
BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) ([]string, string, error)

// Config returns a copy of the agent's configuration.
Config() BuildAgentConfig
Expand Down
2 changes: 1 addition & 1 deletion toolkit/tools/scheduler/buildagents/testagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func (t *TestAgent) Initialize(config *BuildAgentConfig) (err error) {
}

// BuildPackage simply sleeps and then returns success for TestAgent.
func (t *TestAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) (builtFiles []string, logFile string, err error) {
func (t *TestAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (builtFiles []string, logFile string, err error) {
const sleepDuration = time.Second * 5
time.Sleep(sleepDuration)

Expand Down
43 changes: 39 additions & 4 deletions toolkit/tools/scheduler/schedulerutils/buildworker.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package schedulerutils

import (
"bufio"
"context"
"fmt"
"os"
"path/filepath"
Expand Down Expand Up @@ -278,10 +279,27 @@ func buildSRPMFile(agent buildagents.BuildAgent, buildAttempts int, basePackageN
)

logBaseName := filepath.Base(srpmFile) + ".log"
err = retry.Run(func() (buildErr error) {
builtFiles, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies)

// Track the time the build may take, and ensure we don't exceed the maximum limit.
attemptNumber := 0
totalExecutionTimeout := agent.Config().Timeout
deadline := time.Now().Add(totalExecutionTimeout)
ctx, cancelFunc := context.WithDeadline(context.Background(), deadline)
defer cancelFunc()

wasCancelled, err := retry.RunWithLinearBackoff(ctx, func() (buildErr error) {
if attemptNumber > 0 {
logger.Log.Warnf("Build for '%s' failed %d times, retrying up to %d times.", srpmFile, attemptNumber, buildAttempts)
}
attemptNumber++

builtFiles, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies, time.Until(deadline))
return
}, buildAttempts, retryDuration)
if wasCancelled {
err = fmt.Errorf("after %d/%d attempts, the build exceeded the maximum time of %s", attemptNumber, buildAttempts, totalExecutionTimeout)
return
}

return
}
Expand All @@ -296,10 +314,23 @@ func testSRPMFile(agent buildagents.BuildAgent, checkAttempts int, basePackageNa
)

logBaseName := filepath.Base(srpmFile) + ".test.log"
err = retry.Run(func() (buildErr error) {

// Track the time the build may take, and ensure we don't exceed the maximum limit.
attemptNumber := 0
totalExecutionTimeout := agent.Config().Timeout
deadline := time.Now().Add(totalExecutionTimeout)
ctx, cancelFunc := context.WithDeadline(context.Background(), deadline)
defer cancelFunc()

wasCancelled, err := retry.RunWithLinearBackoff(ctx, func() (buildErr error) {
if attemptNumber > 0 {
logger.Log.Warnf("Test for '%s' failed %d times, retrying up to %d times.", srpmFile, attemptNumber, checkAttempts)
}
attemptNumber++

checkFailed = false

_, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies)
_, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies, time.Until(deadline))
if buildErr != nil {
logger.Log.Warnf("Test build for '%s' failed on a non-test build issue. Error: %s", srpmFile, buildErr)
return
Expand All @@ -312,6 +343,10 @@ func testSRPMFile(agent buildagents.BuildAgent, checkAttempts int, basePackageNa
}
return
}, checkAttempts, retryDuration)
if wasCancelled {
err = fmt.Errorf("after %d/%d attempts, the check exceeded the maximum time of %s", attemptNumber, checkAttempts, totalExecutionTimeout)
return
}

if checkFailed {
logger.Log.Debugf("Tests failed for '%s' after %d attempt(s).", basePackageName, checkAttempts)
Expand Down
Loading