Skip to content

Commit

Permalink
add retry_on_deployment
Browse files Browse the repository at this point in the history
  • Loading branch information
LiliDeng committed Sep 13, 2024
1 parent a1358de commit bda89e4
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 21 deletions.
7 changes: 7 additions & 0 deletions lisa/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def __init__(
self,
is_predefined: bool,
warn_as_error: bool,
retry_on_deployment: int,
id_: int,
runbook: schema.Environment,
) -> None:
Expand All @@ -173,6 +174,7 @@ def __init__(
self.is_new: bool = True
self.id: str = str(id_)
self.warn_as_error = warn_as_error
self.retry_on_deployment = retry_on_deployment
self.platform: Optional[Platform] = None
self.log = get_logger("env", self.name)
self.source_test_result: Optional[TestResult] = None
Expand Down Expand Up @@ -380,6 +382,7 @@ def get_guest_environment(self) -> "Environment":
env = Environment(
is_predefined=self.is_predefined,
warn_as_error=self.warn_as_error,
retry_on_deployment=self.retry_on_deployment,
id_=self._raw_id,
runbook=runbook,
)
Expand Down Expand Up @@ -462,9 +465,11 @@ class Environments(EnvironmentsDict):
def __init__(
    self,
    warn_as_error: bool = False,
    retry_on_deployment: int = 0,
) -> None:
    """Create an empty collection and remember the shared settings.

    Args:
        warn_as_error: stored unchanged on the instance as
            ``self.warn_as_error``.
        retry_on_deployment: stored unchanged on the instance as
            ``self.retry_on_deployment``.
    """
    super().__init__()
    # The two settings are independent of each other; they are only kept
    # here so they can be read back from the collection later.
    self.retry_on_deployment = retry_on_deployment
    self.warn_as_error = warn_as_error

def get_or_create(self, requirement: EnvironmentSpace) -> Optional[Environment]:
result: Optional[Environment] = None
Expand Down Expand Up @@ -507,6 +512,7 @@ def from_runbook(
env = Environment(
is_predefined=is_predefined_runbook,
warn_as_error=self.warn_as_error,
retry_on_deployment=self.retry_on_deployment,
id_=id_,
runbook=copied_runbook,
)
Expand All @@ -523,6 +529,7 @@ def load_environments(
if root_runbook:
environments = Environments(
warn_as_error=root_runbook.warn_as_error,
retry_on_deployment=root_runbook.retry_on_deployment,
)

environments_runbook = root_runbook.environments
Expand Down
4 changes: 4 additions & 0 deletions lisa/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -1116,6 +1116,10 @@ def cleanup(self) -> None:
for node in self._list:
node.cleanup()

def clear(self) -> None:
    """Empty the node list and reset ``_default`` to ``None``.

    A fresh list object is bound to ``_list``; the previous list is not
    mutated, so outside references to it keep their content.
    """
    self._default = None
    self._list = []

def append(self, node: Node) -> None:
    """Add ``node`` at the end of the internal node list."""
    nodes = self._list
    nodes.append(node)

Expand Down
56 changes: 36 additions & 20 deletions lisa/runners/lisa_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,30 +290,46 @@ def _prepare_environments(self) -> None:
def _deploy_environment_task(
self, environment: Environment, test_results: List[TestResult]
) -> None:
try:
max_retries = environment.retry_on_deployment + 1
for attempt in range(max_retries):
try:
self.platform.deploy_environment(environment)
assert (
environment.status == EnvironmentStatus.Deployed
), f"actual: {environment.status}"
self._reset_awaitable_timer("deploy")
except ResourceAwaitableException as identifier:
if self._is_awaitable_timeout("deploy"):
try:
# Attempt to deploy the environment
self.platform.deploy_environment(environment)
assert (
environment.status == EnvironmentStatus.Deployed
), f"actual: {environment.status}"
self._reset_awaitable_timer("deploy")
break
except ResourceAwaitableException as identifier:
if self._is_awaitable_timeout("deploy"):
self._log.info(
f"[{environment.name}] timeout on waiting for more "
f"resource: {identifier}, skip assigning case."
)
raise SkippedException(identifier)
else:
# rerun prepare to calculate resource again.
environment.status = EnvironmentStatus.New
except Exception as identifier:
if attempt + 1 < max_retries:
# Log retry information and continue to the next attempt
self._log.info(
f"[{environment.name}] timeout on waiting for more resource: "
f"{identifier}, skip assigning case."
f"[{environment.name}] Deployment failed, retrying... "
f"(Attempt {attempt + 1}/{max_retries - 1})"
)
raise SkippedException(identifier)
else:
# rerun prepare to calculate resource again.
environment.status = EnvironmentStatus.New
except Exception as identifier:
self._attach_failed_environment_to_result(
environment=environment,
result=test_results[0],
exception=identifier,
)
self._delete_environment_task(environment=environment, test_results=[])
self._prepare_environment(environment)
else:
# Final attempt failed; handle the failure
self._attach_failed_environment_to_result(
environment=environment,
result=test_results[0],
exception=identifier,
)
self._delete_environment_task(
environment=environment, test_results=[]
)

def _initialize_environment_task(
self, environment: Environment, test_results: List[TestResult]
Expand Down
7 changes: 7 additions & 0 deletions lisa/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1320,6 +1320,12 @@ def reload_requirements(self) -> None:
class EnvironmentRoot:
# Root-level environment settings; the fields below sit next to the list of
# environment entries rather than inside any single entry.
# NOTE(review): presumably this is the `root_runbook` object read by
# `load_environments` in lisa/environment.py - confirm.
#
# Defaults to False. Forwarded to `Environments(warn_as_error=...)` by
# `load_environments`.
warn_as_error: bool = field(default=False)
# The environment entries themselves; empty list when none are given.
environments: List[Environment] = field(default_factory=list)
# Integer, validated to be >= 0, defaults to 0. The runner's deploy task
# uses `retry_on_deployment + 1` as its maximum number of deployment
# attempts, so 0 means a single attempt.
retry_on_deployment: int = field(
default=0,
metadata=field_metadata(
field_function=fields.Int, validate=validate.Range(min=0)
),
)


@dataclass_json()
Expand Down Expand Up @@ -1477,6 +1483,7 @@ class TestCase(BaseTestCaseFilter):
field_function=fields.Int, validate=validate.Range(min=0)
),
)

# each case with this rule will be run in a new environment.
use_new_environment: bool = False
# Once it's set, a failed test result will be rewritten to success
Expand Down
6 changes: 5 additions & 1 deletion selftests/azure/test_prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,11 @@ def load_environment(
_ = node_req.get_extended_runbook(common.AzureNodeSchema, AZURE)
runbook._original_nodes_requirement.append(node_req)
environment = Environment(
is_predefined=True, warn_as_error=False, id_=0, runbook=runbook
is_predefined=True,
warn_as_error=False,
id_=0,
runbook=runbook,
retry_on_deployment=0,
)

return environment
Expand Down

0 comments on commit bda89e4

Please sign in to comment.