microsoft · dmcilvaney · Sep 19, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024
@@ -163,6 +163,17 @@ func PanicOnError(err interface{}, args ...interface{}) {
 	}
 }
 
+// FatalOnError logs a fatal error and any message strings, then exists (while
+// running any cleanup functions registered with the log package)
+func FatalOnError(err interface{}, args ...interface{}) {
+	if err != nil {
+		if len(args) > 0 {
+			Log.Errorf(args[0].(string), args[1:]...)
+		}
+		Log.Fatalln(err)
+	}
+}
+
 // WarningOnError logs a warning error and any message strings
 func WarningOnError(err interface{}, args ...interface{}) {
 	if err != nil {

@@ -74,13 +74,13 @@ func main() {
 	logger.InitBestEffort(logFlags)
 
 	rpmsDirAbsPath, err := filepath.Abs(*rpmsDirPath)
-	logger.PanicOnError(err, "Unable to find absolute path for RPMs directory '%s'", *rpmsDirPath)
+	logger.FatalOnError(err, "Unable to find absolute path for RPMs directory '%s'", *rpmsDirPath)
 
 	toolchainDirAbsPath, err := filepath.Abs(*toolchainDirPath)
-	logger.PanicOnError(err, "Unable to find absolute path for toolchain RPMs directory '%s'", *toolchainDirPath)
+	logger.FatalOnError(err, "Unable to find absolute path for toolchain RPMs directory '%s'", *toolchainDirPath)
 
 	srpmsDirAbsPath, err := filepath.Abs(*srpmsDirPath)
-	logger.PanicOnError(err, "Unable to find absolute path for SRPMs directory '%s'", *srpmsDirPath)
+	logger.FatalOnError(err, "Unable to find absolute path for SRPMs directory '%s'", *srpmsDirPath)
 
 	chrootDir := buildChrootDirPath(*workDir, *srpmFile, *runCheck)
 
@@ -118,15 +118,15 @@ func main() {
 	}
 
 	builtRPMs, err := buildSRPMInChroot(chrootDir, rpmsDirAbsPath, toolchainDirAbsPath, *workerTar, *srpmFile, *repoFile, *rpmmacrosFile, *outArch, defines, *noCleanup, *runCheck, *packagesToInstall, ccacheManager, *timeout)
-	logger.PanicOnError(err, "Failed to build SRPM '%s'. For details see log file: %s .", *srpmFile, *logFlags.LogFile)
+	logger.FatalOnError(err, "Failed to build SRPM '%s'. For details see log file: %s .", *srpmFile, *logFlags.LogFile)
 
 	// For regular (non-test) package builds:
 	// - Copy the SRPM which produced the package to the output directory.
 	// - Write a comma-separated list of RPMs built to stdout that can be parsed by the invoker.
 	//   Any output from logger will be on stderr so stdout will only contain this output.
 	if !*runCheck {
 		err = copySRPMToOutput(*srpmFile, srpmsDirAbsPath)
-		logger.PanicOnError(err, "Failed to copy SRPM '%s' to output directory '%s'.", *srpmFile, rpmsDirAbsPath)
+		logger.FatalOnError(err, "Failed to copy SRPM '%s' to output directory '%s'.", *srpmFile, rpmsDirAbsPath)
 
 		fmt.Print(strings.Join(builtRPMs, ","))
 	}
@@ -171,6 +171,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
 	quit := make(chan bool)
 	go func() {
 		logger.Log.Infof("Building (%s).", srpmBaseName)
+		startTime := time.Now()
 
 		for {
 			select {
@@ -180,7 +181,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
 				}
 				return
 			case <-time.After(buildHeartbeatTimeout):
-				logger.Log.Infof("Heartbeat: still building (%s).", srpmBaseName)
+				logger.Log.Infof("Heartbeat: still building (%s) after %s.", srpmBaseName, time.Since(startTime).String())
 			}
 		}
 	}()
@@ -238,6 +239,7 @@ func buildSRPMInChroot(chrootDir, rpmDirPath, toolchainDirPath, workerTar, srpmF
 	case <-time.After(timeout):
 		logger.Log.Errorf("Timeout after %v: killing all processes in chroot...", timeout)
 		shell.PermanentlyStopAllChildProcesses(unix.SIGKILL)
+		time.Sleep(5 * time.Second) // If we don't wait, the chroot mounts may still be busy
 		err = fmt.Errorf("build timed out after %s", timeout)
 	}
 

@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"path/filepath"
 	"strings"
+	"time"
 
 	"github.com/microsoft/azurelinux/toolkit/tools/internal/shell"
 	"github.com/sirupsen/logrus"
@@ -38,7 +39,7 @@ func (c *ChrootAgent) Initialize(config *BuildAgentConfig) (err error) {
 // - outArch is the target architecture to build for.
 // - runCheck is true if the package should run the "%check" section during the build
 // - dependencies is a list of dependencies that need to be installed before building.
-func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) (builtFiles []string, logFile string, err error) {
+func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (builtFiles []string, logFile string, err error) {
 	// On success, pkgworker will print a comma-seperated list of all RPMs built to stdout.
 	// This will be the last stdout line written.
 	const delimiter = ","
@@ -50,7 +51,7 @@ func (c *ChrootAgent) BuildPackage(basePackageName, inputFile, logName, outArch
 		lastStdoutLine = strings.TrimSpace(line)
 	}
 
-	args := serializeChrootBuildAgentConfig(c.config, basePackageName, inputFile, logFile, outArch, runCheck, dependencies)
+	args := serializeChrootBuildAgentConfig(c.config, basePackageName, inputFile, logFile, outArch, runCheck, dependencies, allowableRuntime)
 	err = shell.NewExecBuilder(c.config.Program, args...).
 		StdoutCallback(onStdout).
 		LogLevel(logrus.TraceLevel, logrus.TraceLevel).
@@ -75,7 +76,7 @@ func (c *ChrootAgent) Close() (err error) {
 }
 
 // serializeChrootBuildAgentConfig serializes a BuildAgentConfig into arguments usable by pkgworker for the sake of building the package.
-func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName, inputFile, logFile, outArch string, runCheck bool, dependencies []string) (serializedArgs []string) {
+func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName, inputFile, logFile, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (serializedArgs []string) {
 	serializedArgs = []string{
 		fmt.Sprintf("--input=%s", inputFile),
 		fmt.Sprintf("--work-dir=%s", config.WorkDir),
@@ -93,7 +94,7 @@ func serializeChrootBuildAgentConfig(config *BuildAgentConfig, basePackageName,
 		fmt.Sprintf("--log-level=%s", config.LogLevel),
 		fmt.Sprintf("--out-arch=%s", outArch),
 		fmt.Sprintf("--max-cpu=%s", config.MaxCpu),
-		fmt.Sprintf("--timeout=%s", config.Timeout.String()),
+		fmt.Sprintf("--timeout=%s", allowableRuntime),
 	}
 
 	if config.RpmmacrosFile != "" {

@@ -48,7 +48,8 @@ type BuildAgent interface {
 	// - outArch is the target architecture to build for.
 	// - runCheck is true if the package should run the "%check" section during the build
 	// - dependencies is a list of dependencies that need to be installed before building.
-	BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) ([]string, string, error)
+	// - allowableRuntime is how long the package build is allowed to run.
+	BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) ([]string, string, error)
 
 	// Config returns a copy of the agent's configuration.
 	Config() BuildAgentConfig

@@ -28,7 +28,7 @@ func (t *TestAgent) Initialize(config *BuildAgentConfig) (err error) {
 }
 
 // BuildPackage simply sleeps and then returns success for TestAgent.
-func (t *TestAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string) (builtFiles []string, logFile string, err error) {
+func (t *TestAgent) BuildPackage(basePackageName, inputFile, logName, outArch string, runCheck bool, dependencies []string, allowableRuntime time.Duration) (builtFiles []string, logFile string, err error) {
 	const sleepDuration = time.Second * 5
 	time.Sleep(sleepDuration)
 

@@ -5,6 +5,7 @@ package schedulerutils
 
 import (
 	"bufio"
+	"context"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -278,10 +279,27 @@ func buildSRPMFile(agent buildagents.BuildAgent, buildAttempts int, basePackageN
 	)
 
 	logBaseName := filepath.Base(srpmFile) + ".log"
-	err = retry.Run(func() (buildErr error) {
-		builtFiles, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies)
+
+	// Track the time the build may take, and ensure we don't exceed the maximum limit.
+	attemptNumber := 0
+	totalExecutionTimeout := agent.Config().Timeout
+	deadline := time.Now().Add(totalExecutionTimeout)
+	ctx, cancelFunc := context.WithDeadline(context.Background(), deadline)
+	defer cancelFunc()
+
+	wasCancelled, err := retry.RunWithLinearBackoff(ctx, func() (buildErr error) {
+		if attemptNumber > 0 {
+			logger.Log.Warnf("Build for '%s' failed %d times, retrying up to %d times.", srpmFile, attemptNumber, buildAttempts)
+		}
+		attemptNumber++
+
+		builtFiles, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies, time.Until(deadline))
 		return
 	}, buildAttempts, retryDuration)
+	if wasCancelled {
+		err = fmt.Errorf("after %d/%d attempts, the build exceeded the maximum time of %s", attemptNumber, buildAttempts, totalExecutionTimeout)
+		return
+	}
 
 	return
 }
@@ -296,10 +314,23 @@ func testSRPMFile(agent buildagents.BuildAgent, checkAttempts int, basePackageNa
 	)
 
 	logBaseName := filepath.Base(srpmFile) + ".test.log"
-	err = retry.Run(func() (buildErr error) {
+
+	// Track the time the build may take, and ensure we don't exceed the maximum limit.
+	attemptNumber := 0
+	totalExecutionTimeout := agent.Config().Timeout
+	deadline := time.Now().Add(totalExecutionTimeout)
+	ctx, cancelFunc := context.WithDeadline(context.Background(), deadline)
+	defer cancelFunc()
+
+	wasCancelled, err := retry.RunWithLinearBackoff(ctx, func() (buildErr error) {
+		if attemptNumber > 0 {
+			logger.Log.Warnf("Test for '%s' failed %d times, retrying up to %d times.", srpmFile, attemptNumber, checkAttempts)
+		}
+		attemptNumber++
+
 		checkFailed = false
 
-		_, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies)
+		_, logFile, buildErr = agent.BuildPackage(basePackageName, srpmFile, logBaseName, outArch, runCheck, dependencies, time.Until(deadline))
 		if buildErr != nil {
 			logger.Log.Warnf("Test build for '%s' failed on a non-test build issue. Error: %s", srpmFile, buildErr)
 			return
@@ -312,6 +343,10 @@ func testSRPMFile(agent buildagents.BuildAgent, checkAttempts int, basePackageNa
 		}
 		return
 	}, checkAttempts, retryDuration)
+	if wasCancelled {
+		err = fmt.Errorf("after %d/%d attempts, the check exceeded the maximum time of %s", attemptNumber, checkAttempts, totalExecutionTimeout)
+		return
+	}
 
 	if checkFailed {
 		logger.Log.Debugf("Tests failed for '%s' after %d attempt(s).", basePackageName, checkAttempts)