Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bad interactions between timeouts and build retires #10480

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions toolkit/tools/internal/logger/log.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,17 @@ func PanicOnError(err interface{}, args ...interface{}) {
}
}

// FatalOnError logs a fatal error and any message strings, then exists (while
// running any cleanup functions registered with the log package)
func FatalOnError(err interface{}, args ...interface{}) {
if err != nil {
if len(args) > 0 {
Log.Errorf(args[0].(string), args[1:]...)
}
Log.Fatalln(err)
}
}

// WarningOnError logs a warning error and any message strings
func WarningOnError(err interface{}, args ...interface{}) {
if err != nil {
Expand Down
21 changes: 17 additions & 4 deletions toolkit/tools/internal/shell/shell.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,30 @@ func CurrentEnvironment() []string {

// PermanentlyStopAllChildProcesses will send the provided signal to all processes spawned by this package,
// and all of those process's children.
// Invoking this will also block future process creation, causing the Execute methods to return an error.
// Invoking this will also block future process creation, causing the Execute methods to return an error. Be aware that
// this will block the gpg-agent cleanup mechanism from running, which may cause chroots to not unmount properly. Consider
// using StopAllChildProcesses instead.
func PermanentlyStopAllChildProcesses(signal unix.Signal) {
// Acquire the global activeCommandsMutex to ensure no
// new commands are executed during this teardown routine
logger.Log.Info("Waiting for outstanding processes to be created")
logger.Log.Info("Waiting for outstanding processes to be created before blocking all future processes")

activeCommandsMutex.Lock()
defer activeCommandsMutex.Unlock()

// Disallow future processes from being created
allowProcessCreation = false
activeCommandsMutex.Unlock()

StopAllChildProcesses(signal)
}

// StopAllChildProcesses will stop all currently running processes spawned by this package, but will not block future process creation.
func StopAllChildProcesses(signal unix.Signal) {
// Acquire the global activeCommandsMutex to ensure no
// new commands are executed during this teardown routine
logger.Log.Info("Waiting for outstanding processes to be created before stopping all child processes")

activeCommandsMutex.Lock()
defer activeCommandsMutex.Unlock()

// For every running process, issue the provided signal to its process group,
// resulting in both the process and all of its children being stopped.
Expand Down
66 changes: 45 additions & 21 deletions toolkit/tools/pkgworker/pkgworker.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,13 @@ func main() {
logger.InitBestEffort(logFlags)

rpmsDirAbsPath, err := filepath.Abs(*rpmsDirPath)
logger.PanicOnError(err, "Unable to find absolute path for RPMs directory '%s'", *rpmsDirPath)
logger.FatalOnError(err, "Unable to find absolute path for RPMs directory '%s'", *rpmsDirPath)

toolchainDirAbsPath, err := filepath.Abs(*toolchainDirPath)
logger.PanicOnError(err, "Unable to find absolute path for toolchain RPMs directory '%s'", *toolchainDirPath)
logger.FatalOnError(err, "Unable to find absolute path for toolchain RPMs directory '%s'", *toolchainDirPath)

srpmsDirAbsPath, err := filepath.Abs(*srpmsDirPath)
logger.PanicOnError(err, "Unable to find absolute path for SRPMs directory '%s'", *srpmsDirPath)
logger.FatalOnError(err, "Unable to find absolute path for SRPMs directory '%s'", *srpmsDirPath)

chrootDir := buildChrootDirPath(*workDir, *srpmFile, *runCheck)

Expand Down Expand Up @@ -118,15 +118,15 @@ func main() {
}

builtRPMs, err := buildSRPMInChroot(chrootDir, rpmsDirAbsPath, toolchainDirAbsPath, *workerTar, *srpmFile, *repoFile, *rpmmacrosFile, *outArch, defines, *noCleanup, *runCheck, *packagesToInstall, ccacheManager, *timeout)
logger.PanicOnError(err, "Failed to build SRPM '%s'. For details see log file: %s .", *srpmFile, *logFlags.LogFile)
logger.FatalOnError(err, "Failed to build SRPM '%s'. For details see log file: %s .", *srpmFile, *logFlags.LogFile)

// For regular (non-test) package builds:
// - Copy the SRPM which produced the package to the output directory.
// - Write a comma-separated list of RPMs built to stdout that can be parsed by the invoker.
// Any output from logger will be on stderr so stdout will only contain this output.
if !*runCheck {
err = copySRPMToOutput(*srpmFile, srpmsDirAbsPath)
logger.PanicOnError(err, "Failed to copy SRPM '%s' to output directory '%s'.", *srpmFile, rpmsDirAbsPath)
logger.FatalOnError(err, "Failed to copy SRPM '%s' to output directory '%s'.", *srpmFile, rpmsDirAbsPath)

fmt.Print(strings.Join(builtRPMs, ","))
}
Expand Down Expand Up @@ -171,6 +171,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
quit := make(chan bool)
go func() {
logger.Log.Infof("Building (%s).", srpmBaseName)
startTime := time.Now()

for {
select {
Expand All @@ -180,7 +181,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
}
return
case <-time.After(buildHeartbeatTimeout):
logger.Log.Infof("Heartbeat: still building (%s).", srpmBaseName)
logger.Log.Infof("Heartbeat: still building (%s) after %s.", srpmBaseName, time.Since(startTime).String())
}
}
}()
Expand Down Expand Up @@ -214,34 +215,48 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF

err = chroot.Initialize(workerTar, extraDirs, mountPoints, true)
if err != nil {
err = fmt.Errorf("failed to initialize chroot:\n%w", err)
return
}
defer chroot.Close(noCleanup)

// Place extra files that will be needed to build into the chroot
srpmFileInChroot, err := copyFilesIntoChroot(chroot, srpmFile, repoFile, rpmmacrosFile, runCheck)
if err != nil {
err = fmt.Errorf("failed to copy files into chroot:\n%w", err)
return
}

// Run the build in a go routine so we can monitor and kill it if it takes too long.
// Run the build in a goroutine so we can monitor and kill it if it takes too long.
//
// It is important to run the timeout logic inside the chroot so that the chroot cleanup
// flow in chroot.Run() is executed if the build times out.
//
// If the timeout logic is run outside of the chroot.Run() call, the process will still
// be running in the chroot's context and the automatic chroot cleanup triggered by the
// process exiting will fail (see safechroot.go:cleanupAllChroots()). For example,
// `unmount /path/to/chroot/dev` will fail since our root is currently `/path/to/chroot`,
// and `/path/to/chroot/path/to/chroot/dev` is not a real path.
results := make(chan error)
go func() {
buildErr := chroot.Run(func() (err error) {
return buildRPMFromSRPMInChroot(srpmFileInChroot, outArch, runCheck, defines, packagesToInstall, isCCacheEnabled(ccacheManager))
})
results <- buildErr
}()

select {
case err = <-results:
case <-time.After(timeout):
logger.Log.Errorf("Timeout after %v: killing all processes in chroot...", timeout)
shell.PermanentlyStopAllChildProcesses(unix.SIGKILL)
err = fmt.Errorf("build timed out after %s", timeout)
}
err = chroot.Run(func() (err error) {
go func() {
results <- buildRPMFromSRPMInChroot(srpmFileInChroot, outArch, runCheck, defines, packagesToInstall, isCCacheEnabled(ccacheManager))
}()

var chrootErr error = nil
select {
dmcilvaney marked this conversation as resolved.
Show resolved Hide resolved
case chrootErr = <-results:
logger.Log.Debug("Build thread in chroot finished.")
case <-time.After(timeout):
logger.Log.Errorf("Timeout after %v: stopping chroot...", timeout)
shell.StopAllChildProcesses(unix.SIGKILL)
chrootErr = fmt.Errorf("build timed out after %s", timeout)
}
return chrootErr // Internal error is returned via the channel
})

if err != nil {
err = fmt.Errorf("failed to build RPM from SRPM in chroot:\n%w", err)
return
}

Expand All @@ -266,18 +281,21 @@ func buildRPMFromSRPMInChroot(srpmFile, outArch string, runCheck bool, defines m
// Convert /localrpms into a repository that a package manager can use.
err = rpmrepomanager.CreateRepo(chrootLocalRpmsDir)
if err != nil {
err = fmt.Errorf("failed to create local RPM repository:\n%w", err)
return
}

// Convert /toolchainrpms into a repository that a package manager can use.
err = rpmrepomanager.CreateRepo(chrootLocalToolchainDir)
if err != nil {
err = fmt.Errorf("failed to create toolchain RPM repository:\n%w", err)
return
}

// install any additional packages, such as build dependencies.
err = tdnfInstall(packagesToInstall)
if err != nil {
err = fmt.Errorf("failed to install additional packages:\n%w", err)
return
}

Expand All @@ -286,6 +304,7 @@ func buildRPMFromSRPMInChroot(srpmFile, outArch string, runCheck bool, defines m
logger.Log.Infof("USE_CCACHE: installing package: %s", ccachePkgName[0])
err = tdnfInstall(ccachePkgName)
if err != nil {
err = fmt.Errorf("failed to install ccache:\n%w", err)
return
}
}
Expand All @@ -296,6 +315,7 @@ func buildRPMFromSRPMInChroot(srpmFile, outArch string, runCheck bool, defines m
// build failures.
err = removeLibArchivesFromSystem()
if err != nil {
err = fmt.Errorf("failed to remove lib archives from system:\n%w", err)
return
}

Expand All @@ -305,6 +325,10 @@ func buildRPMFromSRPMInChroot(srpmFile, outArch string, runCheck bool, defines m
} else {
err = rpm.BuildRPMFromSRPM(srpmFile, outArch, defines)
}
if err != nil {
err = fmt.Errorf("failed to build/test RPM from SRPM:\n%w", err)
return
}

return
}
Expand Down
9 changes: 5 additions & 4 deletions toolkit/tools/scheduler/buildagents/chrootagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"
"path/filepath"
"strings"
"time"

"github.com/microsoft/azurelinux/toolkit/tools/internal/shell"
"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -38,7 +39,7 @@ func (c *ChrootAgent) Initialize(config *BuildAgentConfig) (err error) {
// - outArch is the target architecture to build for.
// - runCheck is true if the package should run the "%check" section during the build
// - dependencies is a list of dependencies that need to be installed before building.
func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) (builtFiles []string, logFile string, err error) {
func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (builtFiles []string, logFile string, err error) {
// On success, pkgworker will print a comma-seperated list of all RPMs built to stdout.
// This will be the last stdout line written.
const delimiter = ","
Expand All @@ -50,7 +51,7 @@ func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch
lastStdoutLine = strings.TrimSpace(line)
}

args := serializeChrootBuildAgentConfig(c.config, basePackageName, inputFile, logFile, outArch, runCheck, dependencies)
args := serializeChrootBuildAgentConfig(c.config, basePackageName, inputFile, logFile, outArch, runCheck, dependencies, allowableRuntime)
err = shell.NewExecBuilder(c.config.Program, args...).
StdoutCallback(onStdout).
LogLevel(logrus.TraceLevel, logrus.TraceLevel).
Expand All @@ -75,7 +76,7 @@ func (c *ChrootAgent) Close() (err error) {
}

// serializeChrootBuildAgentConfig serializes a BuildAgentConfig into arguments usable by pkgworker for the sake of building the package.
func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName, inputFile, logFile, outArch string, runCheck bool, dependencies []string) (serializedArgs []string) {
func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName, inputFile, logFile, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (serializedArgs []string) {
serializedArgs = []string{
fmt.Sprintf("--input=%s", inputFile),
fmt.Sprintf("--work-dir=%s", config.WorkDir),
Expand All @@ -93,7 +94,7 @@ func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName,
fmt.Sprintf("--log-level=%s", config.LogLevel),
fmt.Sprintf("--out-arch=%s", outArch),
fmt.Sprintf("--max-cpu=%s", config.MaxCpu),
fmt.Sprintf("--timeout=%s", config.Timeout.String()),
fmt.Sprintf("--timeout=%s", allowableRuntime),
}

if config.RpmmacrosFile != "" {
Expand Down
3 changes: 2 additions & 1 deletion toolkit/tools/scheduler/buildagents/definition.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ type BuildAgent interface {
// - outArch is the target architecture to build for.
// - runCheck is true if the package should run the "%check" section during the build
// - dependencies is a list of dependencies that need to be installed before building.
BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) ([]string, string, error)
// - allowableRuntime is how long the package build is allowed to run.
BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) ([]string, string, error)

// Config returns a copy of the agent's configuration.
Config() BuildAgentConfig
Expand Down
2 changes: 1 addition & 1 deletion toolkit/tools/scheduler/buildagents/testagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func (t *TestAgent) Initialize(config *BuildAgentConfig) (err error) {
}

// BuildPackage simply sleeps and then returns success for TestAgent.
func (t *TestAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) (builtFiles []string, logFile string, err error) {
func (t *TestAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (builtFiles []string, logFile string, err error) {
const sleepDuration = time.Second * 5
time.Sleep(sleepDuration)

Expand Down
2 changes: 1 addition & 1 deletion toolkit/tools/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ func cancelOutstandingBuilds(agent buildagents.BuildAgent) {
}

// Issue a SIGINT to all children processes to allow them to gracefully exit.
shell.PermanentlyStopAllChildProcesses(unix.SIGINT)
shell.StopAllChildProcesses(unix.SIGINT)
}

// cancelBuildsOnSignal will stop any builds running on SIGINT/SIGTERM.
Expand Down
43 changes: 39 additions & 4 deletions toolkit/tools/scheduler/schedulerutils/buildworker.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package schedulerutils

import (
"bufio"
"context"
"fmt"
"os"
"path/filepath"
Expand Down Expand Up @@ -278,10 +279,27 @@ func buildSRPMFile(agent buildagents.BuildAgent, buildAttempts int, basePackageN
)

logBaseName := filepath.Base(srpmFile) + ".log"
err = retry.Run(func() (buildErr error) {
builtFiles, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies)

// Track the time the build may take, and ensure we don't exceed the maximum limit.
attemptNumber := 0
totalExecutionTimeout := agent.Config().Timeout
deadline := time.Now().Add(totalExecutionTimeout)
ctx, cancelFunc := context.WithDeadline(context.Background(), deadline)
defer cancelFunc()

wasCancelled, err := retry.RunWithLinearBackoff(ctx, func() (buildErr error) {
if attemptNumber > 0 {
logger.Log.Warnf("Build for '%s' failed %d times, retrying up to %d times.", srpmFile, attemptNumber, buildAttempts)
}
attemptNumber++

builtFiles, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies, time.Until(deadline))
return
}, buildAttempts, retryDuration)
if wasCancelled {
err = fmt.Errorf("after %d/%d attempts, the build exceeded the maximum time of %s", attemptNumber, buildAttempts, totalExecutionTimeout)
return
}

return
}
Expand All @@ -296,10 +314,23 @@ func testSRPMFile(agent buildagents.BuildAgent, checkAttempts int, basePackageNa
)

logBaseName := filepath.Base(srpmFile) + ".test.log"
err = retry.Run(func() (buildErr error) {

// Track the time the build may take, and ensure we don't exceed the maximum limit.
attemptNumber := 0
totalExecutionTimeout := agent.Config().Timeout
deadline := time.Now().Add(totalExecutionTimeout)
ctx, cancelFunc := context.WithDeadline(context.Background(), deadline)
defer cancelFunc()

wasCancelled, err := retry.RunWithLinearBackoff(ctx, func() (buildErr error) {
if attemptNumber > 0 {
logger.Log.Warnf("Test for '%s' failed %d times, retrying up to %d times.", srpmFile, attemptNumber, checkAttempts)
}
attemptNumber++

checkFailed = false

_, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies)
_, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies, time.Until(deadline))
if buildErr != nil {
logger.Log.Warnf("Test build for '%s' failed on a non-test build issue. Error: %s", srpmFile, buildErr)
return
Expand All @@ -312,6 +343,10 @@ func testSRPMFile(agent buildagents.BuildAgent, checkAttempts int, basePackageNa
}
return
}, checkAttempts, retryDuration)
if wasCancelled {
err = fmt.Errorf("after %d/%d attempts, the check exceeded the maximum time of %s", attemptNumber, checkAttempts, totalExecutionTimeout)
return
}

if checkFailed {
logger.Log.Debugf("Tests failed for '%s' after %d attempt(s).", basePackageName, checkAttempts)
Expand Down
Loading